!4597 [Dataset] C++ API Support for build_vocab
This commit is contained in:
parent
9f06974d0f
commit
b50ae27c72
|
@ -26,4 +26,5 @@ add_library(cpp-API OBJECT
|
|||
iterator.cc
|
||||
transforms.cc
|
||||
samplers.cc
|
||||
text.cc
|
||||
)
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
#include "minddata/dataset/engine/datasetops/source/voc_op.h"
|
||||
// Dataset operator headers (in alphabetical order)
|
||||
#include "minddata/dataset/engine/datasetops/batch_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/build_vocab_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/concat_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/map_op/map_op.h"
|
||||
#include "minddata/dataset/engine/datasetops/project_op.h"
|
||||
|
@ -263,6 +264,37 @@ std::shared_ptr<BatchDataset> Dataset::Batch(int32_t batch_size, bool drop_remai
|
|||
return ds;
|
||||
}
|
||||
|
||||
// Function to create a Vocab from dataset
|
||||
std::shared_ptr<Vocab> Dataset::BuildVocab(const std::vector<std::string> &columns,
|
||||
const std::pair<int64_t, int64_t> &freq_range, int64_t top_k,
|
||||
const std::vector<std::string> &special_tokens, bool special_first) {
|
||||
auto vocab = std::make_shared<Vocab>();
|
||||
auto ds = std::make_shared<BuildVocabDataset>(vocab, columns, freq_range, top_k, special_tokens, special_first);
|
||||
|
||||
if (!ds->ValidateParams()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ds->children.push_back(shared_from_this());
|
||||
|
||||
// Run tree here to starting building vocab
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
if (iter == nullptr) {
|
||||
MS_LOG(ERROR) << "Fail to run iterator in BuildVocab.";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Finish building vocab by triggering GetNextRow
|
||||
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
|
||||
iter->GetNextRow(&row);
|
||||
if (vocab == nullptr) {
|
||||
MS_LOG(ERROR) << "Fail to build vocab.";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return vocab;
|
||||
}
|
||||
|
||||
// Function to create a Concat dataset
|
||||
std::shared_ptr<ConcatDataset> Dataset::Concat(const std::vector<std::shared_ptr<Dataset>> &datasets) {
|
||||
auto ds = std::make_shared<ConcatDataset>(datasets);
|
||||
|
@ -1450,13 +1482,52 @@ std::vector<std::shared_ptr<DatasetOp>> BatchDataset::Build() {
|
|||
|
||||
bool BatchDataset::ValidateParams() {
|
||||
if (batch_size_ <= 0) {
|
||||
MS_LOG(ERROR) << "Batch: Batch size cannot be negative";
|
||||
MS_LOG(ERROR) << "Batch: batch_size should be positive integer, but got: " << batch_size_;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Constructor for BuildVocabDataset: record every build setting for later use in Build()
BuildVocabDataset::BuildVocabDataset(std::shared_ptr<Vocab> vocab, const std::vector<std::string> &columns,
                                     const std::pair<int64_t, int64_t> &freq_range, int64_t top_k,
                                     const std::vector<std::string> &special_tokens, bool special_first)
    : vocab_(vocab), columns_(columns), freq_range_(freq_range), top_k_(top_k),
      special_tokens_(special_tokens), special_first_(special_first) {}
|
||||
|
||||
// Function to build BuildVocabDataset
|
||||
std::vector<std::shared_ptr<DatasetOp>> BuildVocabDataset::Build() {
|
||||
// A vector containing shared pointer to the Dataset Ops that this object will create
|
||||
std::vector<std::shared_ptr<DatasetOp>> node_ops;
|
||||
|
||||
std::shared_ptr<BuildVocabOp> build_vocab_op;
|
||||
build_vocab_op = std::make_shared<BuildVocabOp>(vocab_, columns_, freq_range_, top_k_, special_tokens_,
|
||||
special_first_, num_workers_, connector_que_size_);
|
||||
node_ops.push_back(build_vocab_op);
|
||||
return node_ops;
|
||||
}
|
||||
|
||||
bool BuildVocabDataset::ValidateParams() {
|
||||
if (vocab_ == nullptr) {
|
||||
MS_LOG(ERROR) << "BuildVocab: vocab is null.";
|
||||
return false;
|
||||
}
|
||||
if (top_k_ < 0) {
|
||||
MS_LOG(ERROR) << "BuildVocab: top_k shoule be positive, but got: " << top_k_;
|
||||
return false;
|
||||
}
|
||||
if (freq_range_.first < 0 || freq_range_.second > kDeMaxFreq || freq_range_.first > freq_range_.second) {
|
||||
MS_LOG(ERROR) << "BuildVocab: requency_range [a,b] should be 0 <= a <= b (a,b are inclusive), "
|
||||
<< "but got [" << freq_range_.first << ", " << freq_range_.second << "]";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Function to build ConcatOp
|
||||
ConcatDataset::ConcatDataset(const std::vector<std::shared_ptr<Dataset>> &datasets) : datasets_(datasets) {
|
||||
this->children = datasets_;
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "minddata/dataset/include/text.h"
|
||||
#include "minddata/dataset/text/kernels/lookup_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
namespace api {
|
||||
namespace text {
|
||||
|
||||
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token) {
|
||||
auto op = std::make_shared<LookupOperation>(vocab, unknown_token);
|
||||
|
||||
if (!op->ValidateParams()) {
|
||||
return nullptr;
|
||||
}
|
||||
return op;
|
||||
}
|
||||
|
||||
// LookupOperation
|
||||
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token)
|
||||
: vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists) {}
|
||||
|
||||
bool LookupOperation::ValidateParams() {
|
||||
if (vocab_ == nullptr) {
|
||||
LOG(ERROR) << "Lookup: vocab object type is incorrect or null.";
|
||||
return false;
|
||||
}
|
||||
if (unknown_token_.empty()) {
|
||||
LOG(ERROR) << "Lookup: no unknown token is specified.";
|
||||
return false;
|
||||
} else {
|
||||
default_id_ = vocab_->Lookup(unknown_token_);
|
||||
if (default_id_ == Vocab::kNoTokenExists) {
|
||||
LOG(ERROR) << "Lookup: unknown_token: [" + unknown_token_ + "], does not exist in vocab.";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::shared_ptr<TensorOp> LookupOperation::Build() {
|
||||
std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_);
|
||||
return tensor_op;
|
||||
}
|
||||
|
||||
} // namespace text
|
||||
} // namespace api
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -59,6 +59,8 @@ inline void BitClear(uint32_t *bits, uint32_t bitMask) { *bits &= (~bitMask); }
|
|||
|
||||
// Upper bounds used by parameter validation. Comments fixed: int32 max is 2^31 - 1
// (the original said "2^32 -1") and int64 max is 2^63 - 1 (the original said "2^(64-1)").
constexpr int32_t kDeMaxDim = std::numeric_limits<int32_t>::max();   // 2147483647 or 2^31 - 1
constexpr int32_t kDeMaxRank = std::numeric_limits<int32_t>::max();  // 2147483647 or 2^31 - 1
constexpr int64_t kDeMaxFreq = std::numeric_limits<int64_t>::max();  // 9223372036854775807 or 2^63 - 1
constexpr int64_t kDeMaxTopk = std::numeric_limits<int64_t>::max();  // 9223372036854775807 or 2^63 - 1

constexpr uint32_t kCfgRowsPerBuffer = 1;
constexpr uint32_t kCfgParallelWorkers = 4;
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#include "minddata/dataset/include/iterator.h"
|
||||
#include "minddata/dataset/include/samplers.h"
|
||||
#include "minddata/dataset/include/type_id.h"
|
||||
#include "minddata/dataset/text/vocab.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
@ -39,6 +40,7 @@ class DatasetOp;
|
|||
class DataSchema;
|
||||
class Tensor;
|
||||
class TensorShape;
|
||||
class Vocab;
|
||||
|
||||
namespace api {
|
||||
|
||||
|
@ -61,6 +63,7 @@ class TextFileDataset;
|
|||
class VOCDataset;
|
||||
// Dataset Op classes (in alphabetical order)
|
||||
class BatchDataset;
|
||||
class BuildVocabDataset;
|
||||
class ConcatDataset;
|
||||
class MapDataset;
|
||||
class ProjectDataset;
|
||||
|
@ -325,6 +328,24 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
|
|||
/// \return Shared pointer to the current BatchDataset
|
||||
std::shared_ptr<BatchDataset> Batch(int32_t batch_size, bool drop_remainder = false);
|
||||
|
||||
/// \brief Function to create a Vocab from source dataset
|
||||
/// \notes Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
|
||||
/// which contains top_k most frequent words (if top_k is specified)
|
||||
/// \param[in] columns Column names to get words from. It can be a vector of column names
|
||||
/// \param[in] freq_range A tuple of integers (min_frequency, max_frequency). Words within the frequency
|
||||
/// range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
|
||||
/// can be set to default, which corresponds to 0/total_words separately
|
||||
/// \param[in] top_k Number of words to be built into vocab. top_k most frequent words are
|
||||
/// taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken
|
||||
/// \param[in] special_tokens A list of strings, each one is a special token
|
||||
/// \param[in] special_first Whether special_tokens will be prepended/appended to vocab, If special_tokens
|
||||
/// is specified and special_first is set to default, special_tokens will be prepended
|
||||
/// \return Shared pointer to the current Vocab
|
||||
std::shared_ptr<Vocab> BuildVocab(const std::vector<std::string> &columns = {},
|
||||
const std::pair<int64_t, int64_t> &freq_range = {0, kDeMaxFreq},
|
||||
int64_t top_k = kDeMaxTopk, const std::vector<std::string> &special_tokens = {},
|
||||
bool special_first = true);
|
||||
|
||||
/// \brief Function to create a ConcatDataset
|
||||
/// \notes Concat the datasets in the input
|
||||
/// \param[in] datasets List of shared pointers to the dataset that should be concatenated together
|
||||
|
@ -859,6 +880,33 @@ class BatchDataset : public Dataset {
|
|||
std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>> pad_map_;
|
||||
};
|
||||
|
||||
class BuildVocabDataset : public Dataset {
|
||||
public:
|
||||
/// \brief Constructor
|
||||
BuildVocabDataset(std::shared_ptr<Vocab> vocab, const std::vector<std::string> &columns,
|
||||
const std::pair<int64_t, int64_t> &freq_range, int64_t top_k,
|
||||
const std::vector<std::string> &special_tokens, bool special_first);
|
||||
|
||||
/// \brief Destructor
|
||||
~BuildVocabDataset() = default;
|
||||
|
||||
/// \brief a base class override function to create the required runtime dataset op objects for this class
|
||||
/// \return The list of shared pointers to the newly created DatasetOps
|
||||
std::vector<std::shared_ptr<DatasetOp>> Build() override;
|
||||
|
||||
/// \brief Parameters validation
|
||||
/// \return bool true if all the params are valid
|
||||
bool ValidateParams() override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<Vocab> vocab_;
|
||||
std::vector<std::string> columns_;
|
||||
std::pair<int64_t, int64_t> freq_range_;
|
||||
int64_t top_k_;
|
||||
std::vector<std::string> special_tokens_;
|
||||
bool special_first_;
|
||||
};
|
||||
|
||||
class ConcatDataset : public Dataset {
|
||||
public:
|
||||
/// \brief Constructor
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "minddata/dataset/core/constants.h"
|
||||
#include "minddata/dataset/include/transforms.h"
|
||||
#include "minddata/dataset/text/vocab.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
namespace api {
|
||||
|
||||
// Transform operations for text
|
||||
namespace text {
|
||||
|
||||
// Text Op classes (in alphabetical order)
|
||||
class LookupOperation;
|
||||
|
||||
/// \brief Lookup operator that looks up a word to an id.
|
||||
/// \param[in] vocab a Vocab object.
|
||||
/// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
|
||||
/// If unknown_token is oov, runtime error will be thrown
|
||||
/// \return Shared pointer to the current TensorOperation.
|
||||
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token);
|
||||
|
||||
/* ####################################### Derived TensorOperation classes ################################# */
|
||||
|
||||
class LookupOperation : public TensorOperation {
|
||||
public:
|
||||
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token);
|
||||
|
||||
~LookupOperation() = default;
|
||||
|
||||
std::shared_ptr<TensorOp> Build() override;
|
||||
|
||||
bool ValidateParams() override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<Vocab> vocab_;
|
||||
std::string unknown_token_;
|
||||
int32_t default_id_;
|
||||
};
|
||||
} // namespace text
|
||||
} // namespace api
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_
|
|
@ -17,8 +17,10 @@
|
|||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
|
||||
#include "minddata/dataset/text/vocab.h"
|
||||
#include "utils/log_adapter.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
@ -51,6 +53,147 @@ Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tok
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// Build a Vocab from a python dict: copy each (word, id) entry into a c++ map and wrap it.
Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) {
  std::unordered_map<WordType, WordIdType> word_map;
  for (auto entry : words) {
    word_map[py::str(entry.first)] = py::reinterpret_borrow<py::int_>(entry.second);
  }
  *vocab = std::make_shared<Vocab>(std::move(word_map));
  return Status::OK();
}
|
||||
|
||||
void Vocab::append_word(const std::string &word) {
|
||||
if (word2id_.find(word) == word2id_.end()) {
|
||||
word2id_[word] = word2id_.size();
|
||||
}
|
||||
}
|
||||
|
||||
// Build a Vocab from a c++ word -> id map. Rejects negative ids.
// Fixes: "negetive" typo in the error message; pair was copied on every loop iteration.
Status Vocab::BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
                                    std::shared_ptr<Vocab> *vocab) {
  // Validate parameters and build map
  std::unordered_map<WordType, WordIdType> word2id;
  for (const auto &p : words) {
    if (p.second < 0) {
      MS_LOG(ERROR) << "index can not be negative, but got " << p.second;
      RETURN_STATUS_UNEXPECTED("index can not be negative, but got " + std::to_string(p.second));
    }
    word2id[p.first] = p.second;
  }
  *vocab = std::make_shared<Vocab>(std::move(word2id));
  return Status::OK();
}
|
||||
|
||||
// Build a Vocab from a word list plus special tokens. Ids are assigned in order; when
// prepend_special is true the special tokens take ids [0, n) and normal words follow.
// Fixes: duplicate detection was O(n^2) (std::count per element) and used a substring test
// (duplicate_word.find(word)) that misses duplicates whose name is a substring of an
// already-reported one; the error message also began with a stray ", ".
Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
                              bool prepend_special, std::shared_ptr<Vocab> *vocab) {
  // Helper: list words that appear more than once, in first-occurrence order, joined by ", ".
  auto find_duplicates = [](const std::vector<WordType> &list) -> std::string {
    std::unordered_map<WordType, int> counts;
    for (const WordType &w : list) {
      ++counts[w];
    }
    std::string dup;
    std::unordered_set<WordType> reported;
    for (const WordType &w : list) {
      if (counts[w] > 1 && reported.insert(w).second) {
        if (!dup.empty()) {
          dup += ", ";
        }
        dup += w;
      }
    }
    return dup;
  };

  // Validate parameters: reject duplicates in words and in special_tokens
  std::string duplicate_word = find_duplicates(words);
  if (!duplicate_word.empty()) {
    MS_LOG(ERROR) << "words contains duplicate word: " << duplicate_word;
    RETURN_STATUS_UNEXPECTED("words contains duplicate word: " + duplicate_word);
  }

  std::string duplicate_sp = find_duplicates(special_tokens);
  if (!duplicate_sp.empty()) {
    MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp;
    RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp);
  }

  std::unordered_map<WordType, WordIdType> word2id;

  // if special is added in front, normal words id will start from number of special tokens
  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  for (const WordType &word : words) {
    word2id[word] = word_id++;
  }

  // Special tokens come first (ids from 0) or last (ids after all normal words)
  word_id = prepend_special ? 0 : static_cast<WordIdType>(word2id.size());
  for (const WordType &special_token : special_tokens) {
    word2id[special_token] = word_id++;
  }

  *vocab = std::make_shared<Vocab>(std::move(word2id));
  return Status::OK();
}
|
||||
|
||||
// Build a Vocab by reading a file (one word per line, optionally truncated at a delimiter).
// vocab_size == -1 means read the whole file. Special tokens get ids before (prepend) or
// after (append) the file words. Fixes: "shoule" typos, signed/unsigned comparison with
// vocab_size, substring-based duplicate detection for special_tokens, and a redundant
// second loop that re-filled the `specials` set.
Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
                               const std::vector<WordType> &special_tokens, bool prepend_special,
                               std::shared_ptr<Vocab> *vocab) {
  // Validate parameters
  if (vocab_size < 0 && vocab_size != -1) {
    MS_LOG(ERROR) << "vocab_size should be either -1 or positive integer, but got " << vocab_size;
    RETURN_STATUS_UNEXPECTED("vocab_size should be either -1 or positive integer, but got " +
                             std::to_string(vocab_size));
  }

  // Collect unique special tokens; the same set later rejects file words that clash with them
  std::unordered_set<std::string> specials;
  std::unordered_set<std::string> reported;  // duplicates already added to the message
  std::string duplicate_sp;
  for (const WordType &sp : special_tokens) {
    // insert() returns {it, false} on the second and later occurrences
    if (!specials.insert(sp).second && reported.insert(sp).second) {
      duplicate_sp = duplicate_sp + ", " + sp;
    }
  }
  if (!duplicate_sp.empty()) {
    MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp;
    RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp);
  }

  // File words start after the special tokens when those are prepended
  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  std::unordered_map<WordType, WordIdType> word2id;
  std::fstream handle(path, std::ios::in);
  if (!handle.good() || !handle.is_open()) {
    MS_LOG(ERROR) << "fail to open:" + path;
    RETURN_STATUS_UNEXPECTED("fail to open:" + path);
  }
  std::string word;
  while (std::getline(handle, word)) {
    if (!delimiter.empty()) {
      // if delimiter is not found, find_first_of returns std::string::npos and substr keeps the whole line
      word = word.substr(0, word.find_first_of(delimiter));
    }
    if (word2id.find(word) != word2id.end()) {
      MS_LOG(ERROR) << "duplicate word:" + word + ".";
      RETURN_STATUS_UNEXPECTED("duplicate word:" + word + ".");
    }
    if (specials.find(word) != specials.end()) {
      MS_LOG(ERROR) << word + " is already in special_tokens.";
      RETURN_STATUS_UNEXPECTED(word + " is already in special_tokens.");
    }
    word2id[word] = word_id++;
    // Stop once vocab_size words have been read; -1 reads the whole file
    if (vocab_size != -1 && word2id.size() == static_cast<size_t>(vocab_size)) break;
  }

  // Special tokens come first (ids from 0) or last (ids after all file words)
  word_id = prepend_special ? 0 : static_cast<WordIdType>(word2id.size());
  for (const auto &special_token : special_tokens) {
    word2id[special_token] = word_id++;
  }

  *vocab = std::make_shared<Vocab>(std::move(word2id));
  return Status::OK();
}
|
||||
|
||||
Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
|
||||
const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab) {
|
||||
// python validator checks special_tokens doesn't contain any duplicate words
|
||||
|
@ -86,21 +229,6 @@ Status Vocab::BuildFromFile(const std::string &path, const std::string &delimite
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) {
|
||||
std::unordered_map<WordType, WordIdType> word2id;
|
||||
for (auto p : words) {
|
||||
word2id[py::str(p.first)] = py::reinterpret_borrow<py::int_>(p.second);
|
||||
}
|
||||
*vocab = std::make_shared<Vocab>(std::move(word2id));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Vocab::append_word(const std::string &word) {
|
||||
if (word2id_.find(word) == word2id_.end()) {
|
||||
word2id_[word] = word2id_.size();
|
||||
}
|
||||
}
|
||||
|
||||
const WordIdType Vocab::kNoTokenExists = -1;
|
||||
|
||||
} // namespace dataset
|
||||
|
|
|
@ -57,6 +57,34 @@ class Vocab {
|
|||
static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
|
||||
const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab);
|
||||
|
||||
/// \brief Build a vocab from a c++ map. Word ids must be non-negative; duplicates/continuity are not checked here
|
||||
/// \param[in] words An unordered_map containing word, word id pair.
|
||||
/// \param[out] vocab A vocab object
|
||||
/// \return Error code
|
||||
static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
|
||||
std::shared_ptr<Vocab> *vocab);
|
||||
|
||||
/// \brief Build a vocab from a c++ vector. Ids are assigned in word order, starting after the special tokens when those are prepended
|
||||
/// \param[in] words A vector of string, used to build vocab, id starts from 2
|
||||
/// \param[in] special_tokens A vector of string contain special tokens
|
||||
/// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
|
||||
/// \param[out] vocab A vocab object
|
||||
/// \return Error code
|
||||
static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
|
||||
bool prepend_special, std::shared_ptr<Vocab> *vocab);
|
||||
|
||||
/// \brief Build a vocab from reading a vocab file; ids are automatically assigned in line order
|
||||
/// \param[in] path Path to vocab file, each line is assumed to contain 1 word
|
||||
/// \param[in] delimiter Delimiter to break each line with
|
||||
/// \param[in] vocab_size Number of words to read from file
|
||||
/// \param[in] special_tokens A vector of string contain special tokens
|
||||
/// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
|
||||
/// \param[out] vocab A vocab object
|
||||
/// \return Error code
|
||||
static Status BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
|
||||
const std::vector<WordType> &special_tokens, bool prepend_special,
|
||||
std::shared_ptr<Vocab> *vocab);
|
||||
|
||||
// Lookup the id of a word, if word doesn't exist in vocab, return default_id
|
||||
// @param const WordType word - word to look up
|
||||
// @param WordIdType default_id - word id to return to user when its not in the vocab
|
||||
|
|
|
@ -97,6 +97,7 @@ SET(DE_UT_SRCS
|
|||
concatenate_op_test.cc
|
||||
cyclic_array_test.cc
|
||||
perf_data_test.cc
|
||||
build_vocab_test.cc
|
||||
c_api_samplers_test.cc
|
||||
c_api_transforms_test.cc
|
||||
c_api_dataset_ops_test.cc
|
||||
|
@ -104,12 +105,13 @@ SET(DE_UT_SRCS
|
|||
c_api_dataset_clue_test.cc
|
||||
c_api_dataset_coco_test.cc
|
||||
c_api_dataset_csv_test.cc
|
||||
c_api_dataset_filetext_test.cc
|
||||
c_api_dataset_textfile_test.cc
|
||||
c_api_dataset_manifest_test.cc
|
||||
c_api_dataset_randomdata_test.cc
|
||||
c_api_dataset_voc_test.cc
|
||||
c_api_datasets_test.cc
|
||||
c_api_dataset_iterator_test.cc
|
||||
c_api_dataset_vocab.cc
|
||||
tensor_op_fusion_pass_test.cc
|
||||
sliding_window_op_test.cc
|
||||
epoch_ctrl_op_test.cc
|
||||
|
|
|
@ -0,0 +1,229 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
#include "minddata/dataset/include/status.h"
|
||||
|
||||
using mindspore::dataset::Tensor;
|
||||
using mindspore::dataset::Status;
|
||||
using mindspore::dataset::Vocab;
|
||||
|
||||
class MindDataTestVocab : public UT::DatasetOpTesting {
|
||||
protected:
|
||||
};
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromUnorderedMap.";
  // Build a vocab from an explicit word -> id map
  std::unordered_map<std::string, int32_t> dict = {{"banana", 0}, {"apple", 1}, {"cat", 2}, {"dog", 3}};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromUnorderedMap(dict, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Known words resolve to their ids, unknown words to -1
  std::vector<std::string> queries = {"apple", "dog", "egg"};
  std::vector<int32_t> ids = {1, 3, -1};
  for (size_t i = 0; i < queries.size(); ++i) {
    EXPECT_EQ(vocab->Lookup(queries[i]), ids[i]);
  }
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyMap.";
  // An empty map yields an empty vocab, so every lookup misses with -1
  std::unordered_map<std::string, int32_t> dict;
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromUnorderedMap(dict, &vocab);
  EXPECT_EQ(rc, Status::OK());

  for (const std::string &w : {"apple", "dog", "egg"}) {
    EXPECT_EQ(vocab->Lookup(w), -1);
  }
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromMapFail) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromMapFail.";
  // Expected failure: negative word ids are rejected
  std::unordered_map<std::string, int32_t> dict = {{"banana", 0}, {"apple", -1}};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  EXPECT_NE(Vocab::BuildFromUnorderedMap(dict, &vocab), Status::OK());
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorPrependSpTokens.";
  // "<unk>" is prepended, so normal words start at id 1; misses return -1
  std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(list, {"<unk>"}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  std::vector<std::string> queries = {"apple", "banana", "fox"};
  std::vector<int32_t> ids = {1, 2, -1};
  for (size_t i = 0; i < queries.size(); ++i) {
    EXPECT_EQ(vocab->Lookup(queries[i]), ids[i]);
  }
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorAppendSpTokens.";
  // Special tokens are appended: normal words keep ids 0..4, "<unk>" gets id 5
  std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(list, {"<unk>"}, false, &vocab);
  EXPECT_EQ(rc, Status::OK());

  std::vector<std::string> queries = {"apple", "<unk>", "fox"};
  std::vector<int32_t> ids = {0, 5, -1};
  for (size_t i = 0; i < queries.size(); ++i) {
    EXPECT_EQ(vocab->Lookup(queries[i]), ids[i]);
  }
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorWithNoSpTokens.";
  // Without special tokens, words get ids 0..4 in list order; misses return -1
  std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
  std::vector<std::string> sp_tokens = {};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(list, sp_tokens, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  std::vector<std::string> queries = {"apple", "banana", "fox", "<pad>"};
  std::vector<int32_t> ids = {0, 1, -1, -1};
  for (size_t i = 0; i < queries.size(); ++i) {
    EXPECT_EQ(vocab->Lookup(queries[i]), ids[i]);
  }
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyVector.";
  // Empty word list with no special tokens: every lookup misses with -1
  std::vector<std::string> list = {};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(list, {}, false, &vocab);
  EXPECT_EQ(rc, Status::OK());

  for (const std::string &w : {"apple", "banana", "fox"}) {
    EXPECT_EQ(vocab->Lookup(w), -1);
  }
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromVectorFail1) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail1.";
  // Expected failure: "apple" and "cat" each appear twice in the word list
  std::vector<std::string> list = {"apple", "apple", "cat", "cat", "egg"};
  std::vector<std::string> sp_tokens = {};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  EXPECT_NE(Vocab::BuildFromVector(list, sp_tokens, true, &vocab), Status::OK());
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromVectorFail2) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail2.";
  // Expected failure: "<pad>" and "<unk>" are duplicated special tokens
  std::vector<std::string> list = {"apple", "dog", "egg"};
  std::vector<std::string> sp_tokens = {"<pad>", "<unk>", "<pad>", "<unk>", "<none>"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  EXPECT_NE(Vocab::BuildFromVector(list, sp_tokens, true, &vocab), Status::OK());
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromFile) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFile.";
  // Build a vocab from a local vocab file, prepending two special tokens.
  std::string vocab_path = datasets_root_path_ + "/testVocab/vocab_list.txt";
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromFileCpp(vocab_path, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // File words come after the two special tokens, so their ids start at 2.
  const std::vector<std::pair<std::string, int32_t>> cases = {{"not", 2}, {"all", 3}};
  for (const auto &c : cases) {
    EXPECT_EQ(vocab->Lookup(c.first), c.second);
  }
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromFileFail1) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail1.";
  // Build vocab from a local file that does not exist
  std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt";
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  // Expected failure: the vocab file cannot be opened
  Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab);
  EXPECT_NE(s, Status::OK());
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromFileFail2) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail2.";
  // Build vocab from local file
  std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();

  // Expected failure: vocab_size should be either -1 (no limit) or a positive integer
  Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab);
  EXPECT_NE(s, Status::OK());
}
|
||||
|
||||
TEST_F(MindDataTestVocab, TestVocabFromFileFail3) {
  // Fixed copy-paste bug: the log message previously said "...Fail2".
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail3.";
  // Build vocab from an existing local vocab file; the failure in this case
  // comes from the special-token list, not from the file path.
  std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();

  // Expected failure: duplicate special token <unk>
  Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
  EXPECT_NE(s, Status::OK());
}
|
|
@ -14,7 +14,6 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/engine/datasetops/source/voc_op.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
|
||||
using namespace mindspore::dataset::api;
|
||||
|
|
|
@ -0,0 +1,254 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
#include "minddata/dataset/include/status.h"
|
||||
#include "minddata/dataset/include/transforms.h"
|
||||
#include "minddata/dataset/include/text.h"
|
||||
|
||||
using namespace mindspore::dataset::api;
|
||||
using mindspore::dataset::ShuffleMode;
|
||||
using mindspore::dataset::Tensor;
|
||||
using mindspore::dataset::Status;
|
||||
using mindspore::dataset::Vocab;
|
||||
|
||||
// Test fixture for the C++ dataset pipeline API tests in this file; inherits
// common test setup (e.g. datasets_root_path_) from UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOp.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build a vocab from a word list, with the special tokens placed first
  std::vector<std::string> word_list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(word_list, {"<pad>", "<unk>"}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Map each token in the "text" column to its vocab id; out-of-vocabulary
  // tokens fall back to the id of "<unk>"
  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
  EXPECT_NE(lookup, nullptr);
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Creating the iterator builds and launches the execution tree
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Walk the pipeline output and verify each produced id
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<int32_t> expected = {2, 1, 4, 5, 6, 7};
  for (uint64_t i = 0; !row.empty(); ++i) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind->shape() << " " << *ind;
    std::shared_ptr<Tensor> expected_item;
    Tensor::CreateScalar(expected[i], &expected_item);
    EXPECT_EQ(*ind, *expected_item);
    iter->GetNextRow(&row);
  }
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail1.";
  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build a vocab with no special tokens at all
  std::vector<std::string> word_list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(word_list, {}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Expected failure: the unknown token "<unk>" is not a word of the vocab
  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
  EXPECT_EQ(lookup, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail2.";
  // A default-constructed shared_ptr: no vocab object behind it
  std::shared_ptr<Vocab> vocab;

  // Expected failure: vocab is null, so the op cannot be created
  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "");
  EXPECT_EQ(lookup, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabLookupOpWithEmptyUnknownToken) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpWithEmptyUnknownToken.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build a single-entry vocab from an unordered map
  std::unordered_map<std::string, int32_t> dict = {{"Home", 3}};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromUnorderedMap(dict, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Expected failure: the empty string "" is not a word of the vocab
  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "");
  EXPECT_EQ(lookup, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build a vocab over the "text" column with no frequency or top_k limits,
  // placing the special tokens at the front of the id space
  constexpr int64_t kNoLimit = std::numeric_limits<int64_t>::max();
  std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {0, kNoLimit}, kNoLimit, {"<pad>", "<unk>"}, true);
  EXPECT_NE(vocab, nullptr);

  // Spot-check one known word id
  EXPECT_EQ(vocab->Lookup("home"), 4);

  // Map tokens to ids; out-of-vocabulary tokens become the id of "<unk>"
  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
  EXPECT_NE(lookup, nullptr);
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Creating the iterator builds and launches the execution tree
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Walk the pipeline output and verify each produced id
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<int32_t> expected = {4, 5, 3, 6, 7, 2};
  for (uint64_t i = 0; !row.empty(); ++i) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind->shape() << " " << *ind;
    std::shared_ptr<Tensor> expected_item;
    Tensor::CreateScalar(expected[i], &expected_item);
    EXPECT_EQ(*ind, *expected_item);
    iter->GetNextRow(&row);
  }
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetDefault.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build a vocab from the dataset using all default arguments
  std::shared_ptr<Vocab> vocab = ds->BuildVocab();
  EXPECT_NE(vocab, nullptr);

  // Spot-check one known word id
  EXPECT_EQ(vocab->Lookup("home"), 2);

  // Map tokens to ids; "home" doubles as the unknown-token fallback here
  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home");
  EXPECT_NE(lookup, nullptr);
  ds = ds->Map({lookup});
  EXPECT_NE(ds, nullptr);

  // Creating the iterator builds and launches the execution tree
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Walk the pipeline output and verify each produced id
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<int32_t> expected = {2, 3, 1, 4, 5, 0};
  for (uint64_t i = 0; !row.empty(); ++i) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind->shape() << " " << *ind;
    std::shared_ptr<Tensor> expected_item;
    Tensor::CreateScalar(expected[i], &expected_item);
    EXPECT_EQ(*ind, *expected_item);
    iter->GetNextRow(&row);
  }
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Expected failure: top_k must not be negative
  std::shared_ptr<Vocab> vocab =
    ds->BuildVocab({"text"}, {0, std::numeric_limits<int64_t>::max()}, -2, {"<pad>", "<unk>"}, true);
  EXPECT_EQ(vocab, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create vocab from dataset
  // Expected failure: frequency_range [a, b] must satisfy 0 <= a <= b
  std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {4, 1},
                                                std::numeric_limits<int64_t>::max(), {"<pad>", "<unk>"}, true);
  EXPECT_EQ(vocab, nullptr);
}
|
Loading…
Reference in New Issue