!30982 Vocab and SentencePieceVocab C++ interface alignment and Python interface refactoring
Merge pull request !30982 from 刘勇琪/master-vocab-sentencepiecevocab
Commit 872cb74d3f
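At a glance, this commit renames the C++ Vocab interface: Lookup becomes TokensToIds, ReverseLookup becomes IdsToTokens, append_word becomes AppendWord, vocab() becomes GetVocab, BuildFromFileCpp becomes BuildFromFile, and the pybind-only BuildFromPyList/BuildFromPyDict give way to BuildFromVector/BuildFromUnorderedMap. A minimal usage sketch against the new interface (the main() scaffolding is illustrative, not from this commit; the calls mirror the headers and tests in the hunks below):

    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    #include "minddata/dataset/include/dataset/text.h"  // Vocab now lives here (was minddata/dataset/text/vocab.h)

    using mindspore::Status;
    using mindspore::dataset::Vocab;

    int main() {
      // BuildFromUnorderedMap replaces the pybind-only BuildFromPyDict.
      std::unordered_map<std::string, int32_t> dict = {{"<pad>", 0}, {"home", 1}, {"behind", 2}};
      std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
      Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
      if (s != Status::OK()) return 1;

      // Lookup/ReverseLookup are now TokensToIds/IdsToTokens; misses return -1 and "".
      int32_t id = vocab->TokensToIds("home");                                                 // 1
      std::vector<int32_t> ids = vocab->TokensToIds(std::vector<std::string>{"home", "oov"});  // {1, -1}
      std::string word = vocab->IdsToTokens(2);                                                // "behind"
      return 0;
    }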
@@ -38,8 +38,7 @@
 #include "minddata/dataset/util/status.h"
 #ifndef ENABLE_ANDROID
 #include "minddata/dataset/engine/ir/cache/dataset_cache_impl.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 #endif
 
 // Sampler headers (in alphabetical order)
@@ -23,7 +23,6 @@
 #include "minddata/dataset/core/data_type.h"
 #include "minddata/dataset/engine/serdes.h"
 #include "minddata/dataset/include/dataset/constants.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/util/path.h"
 
 // IR non-leaf nodes
@@ -19,12 +19,11 @@
 
 #include "minddata/dataset/api/python/pybind_register.h"
 #include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/text/char_n_gram.h"
 #include "minddata/dataset/text/fast_text.h"
 #include "minddata/dataset/text/glove.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/text/vectors.h"
-#include "minddata/dataset/text/vocab.h"
 
 namespace mindspore {
 namespace dataset {
@@ -32,28 +31,29 @@ PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
   (void)py::class_<Vocab, std::shared_ptr<Vocab>>(*m, "Vocab")
     .def(py::init<>())
     .def_static("from_list",
-                [](const py::list &words, const py::list &special_tokens, bool special_first) {
+                [](const std::vector<std::string> &words,
+                   const std::vector<std::string> &special_tokens, bool special_first) {
                   std::shared_ptr<Vocab> v;
-                  THROW_IF_ERROR(Vocab::BuildFromPyList(words, special_tokens, special_first, &v));
+                  THROW_IF_ERROR(Vocab::BuildFromVector(words, special_tokens, special_first, &v));
                   return v;
                 })
     .def_static(
       "from_file",
       [](const std::string &path, const std::string &dlm, int32_t vocab_size,
-         const py::list &special_tokens, bool special_first) {
+         const std::vector<std::string> &special_tokens, bool special_first) {
         std::shared_ptr<Vocab> v;
         THROW_IF_ERROR(Vocab::BuildFromFile(path, dlm, vocab_size, special_tokens, special_first, &v));
         return v;
       })
     .def_static("from_dict",
-                [](const py::dict &words) {
+                [](const std::unordered_map<WordType, WordIdType> &words) {
                   std::shared_ptr<Vocab> v;
-                  THROW_IF_ERROR(Vocab::BuildFromPyDict(words, &v));
+                  THROW_IF_ERROR(Vocab::BuildFromUnorderedMap(words, &v));
                   return v;
                 })
     .def("tokens_to_ids",
         [](Vocab &self, const std::vector<std::string> words) {
-           auto ids = self.Lookup(words);
+           auto ids = self.TokensToIds(words);
           py::object ret;
           if (ids.size() == 1) {
             ret = py::int_(ids[0]);
@@ -65,7 +65,7 @@ PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
        })
    .def("ids_to_tokens",
         [](Vocab &self, const std::vector<int32_t> ids) {
-          auto words = self.ReverseLookup(ids);
+          auto words = self.IdsToTokens(ids);
          py::object ret;
          if (words.size() == 1) {
            ret = py::str(words[0]);
@@ -75,31 +75,19 @@ PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
          }
          return ret;
        })
-   .def("vocab", [](Vocab &self) { return self.vocab(); });
+   .def("vocab", [](Vocab &self) { return self.GetVocab(); });
 }));
 
 PYBIND_REGISTER(SentencePieceVocab, 0, ([](const py::module *m) {
   (void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab")
     .def(py::init<>())
     .def_static("from_file",
-                [](const py::list &paths, const int32_t vocab_size, const float character_coverage,
-                   const SentencePieceModel model_type, const py::dict &params) {
+                [](const std::vector<std::string> &paths, const int32_t vocab_size,
+                   const float character_coverage, const SentencePieceModel model_type,
+                   const std::unordered_map<std::string, std::string> &params) {
                   std::shared_ptr<SentencePieceVocab> v;
-                  std::vector<std::string> path_list;
-                  for (auto path : paths) {
-                    path_list.emplace_back(py::str(path));
-                  }
-                  std::unordered_map<std::string, std::string> param_map;
-                  for (auto param : params) {
-                    std::string key = py::reinterpret_borrow<py::str>(param.first);
-                    if (key == "input" || key == "vocab_size" || key == "model_prefix" ||
-                        key == "character_coverage" || key == "model_type") {
-                      continue;
-                    }
-                    param_map[key] = py::reinterpret_borrow<py::str>(param.second);
-                  }
                   THROW_IF_ERROR(SentencePieceVocab::BuildFromFile(
-                    path_list, vocab_size, character_coverage, model_type, param_map, &v));
+                    paths, vocab_size, character_coverage, model_type, params, &v));
                   return v;
                 })
     .def_static("save_model", [](const std::shared_ptr<SentencePieceVocab> *vocab, std::string path,
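Note why the hand-written conversion loops above could be deleted: with pybind11's STL casters included, a Python list or dict argument is converted automatically at the binding boundary, so the lambdas can take std::vector and std::unordered_map directly. A standalone sketch of the mechanism (module and function names are illustrative, not part of this commit):

    #include <pybind11/pybind11.h>
    #include <pybind11/stl.h>  // enables list <-> std::vector and dict <-> std::unordered_map casting

    #include <string>
    #include <unordered_map>
    #include <vector>

    namespace py = pybind11;

    // A Python list of str arrives as std::vector<std::string>,
    // a Python dict of str -> str as std::unordered_map<std::string, std::string>.
    size_t Demo(const std::vector<std::string> &words,
                const std::unordered_map<std::string, std::string> &params) {
      return words.size() + params.size();
    }

    PYBIND11_MODULE(demo, m) { m.def("demo", &Demo); }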
@@ -14,13 +14,12 @@
  * limitations under the License.
  */
 
+#include "minddata/dataset/api/python/pybind_register.h"
+#include "minddata/dataset/include/dataset/text.h"
+#include "minddata/dataset/text/ir/kernels/text_ir.h"
+#include "minddata/dataset/text/vectors.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl_bind.h"
-#include "minddata/dataset/api/python/pybind_register.h"
-#include "minddata/dataset/text/ir/kernels/text_ir.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
-#include "minddata/dataset/text/vectors.h"
-#include "minddata/dataset/text/vocab.h"
 
 namespace mindspore {
 namespace dataset {
@@ -24,7 +24,7 @@
 #include <vector>
 
 #include "minddata/dataset/engine/tree_adapter.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 
 namespace mindspore::dataset {
 // Forward declare
@@ -28,9 +28,9 @@
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/engine/dataset_iterator.h"
 #include "minddata/dataset/engine/datasetops/pipeline_op.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/util/status.h"
 #include "minddata/dataset/util/queue.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "pybind11/pybind11.h"
 
 namespace mindspore {
@@ -54,7 +54,7 @@ class BuildSentencePieceVocabOp : public PipelineOp {
     BuildSentencePieceVocabOp *s_p_vocab_ptr_;
   };
 
-  BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
+  BuildSentencePieceVocabOp(std::shared_ptr<dataset::SentencePieceVocab> vocab, std::vector<std::string> col_names,
                             int32_t vocab_size, float character_coverage, SentencePieceModel model_type,
                             const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);
 
@@ -179,15 +179,15 @@ Status BuildVocabOp::CollectorThread() {
   });
 
   if (special_first_) {
-    for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
+    for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
   }
 
   for (int64_t i = 0; i < num_words; i++) {
-    vocab_->append_word(words[i]);
+    vocab_->AppendWord(words[i]);
   }
 
   if (!special_first_) {
-    for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
+    for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
   }
 
   RETURN_IF_NOT_OK(out_connector_->SendEOE());
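The special_first_ branches above decide whether special tokens receive the lowest ids (prepended before the corpus words) or the highest ids (appended after them). A small sketch of the visible effect through the public builder (token lists are illustrative, not from this commit):

    // prepend_special == true : specials take ids 0..k-1, corpus words follow.
    std::shared_ptr<Vocab> v = std::make_shared<Vocab>();
    Status s = Vocab::BuildFromVector({"cat", "dog"}, {"<pad>", "<unk>"}, /*prepend_special=*/true, &v);
    // v->TokensToIds("<pad>") == 0, v->TokensToIds("<unk>") == 1, v->TokensToIds("cat") == 2
    // with prepend_special == false: "cat" == 0, "dog" == 1, "<pad>" == 2, "<unk>" == 3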
@@ -25,7 +25,7 @@
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/engine/dataset_iterator.h"
 #include "minddata/dataset/engine/datasetops/parallel_op.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/util/queue.h"
 #include "minddata/dataset/util/status.h"
 
@@ -33,9 +33,9 @@ namespace mindspore {
 namespace dataset {
 class BuildVocabOp : public ParallelOp<TensorRow, TensorRow> {
  public:
-  BuildVocabOp(std::shared_ptr<Vocab> vocab, std::vector<std::string> col_names, std::pair<int64_t, int64_t> freq_range,
-               int64_t top_k, const std::vector<std::string> &tokens, bool prepend, int32_t num_workers,
-               int32_t op_connector_size);
+  BuildVocabOp(std::shared_ptr<dataset::Vocab> vocab, std::vector<std::string> col_names,
+               std::pair<int64_t, int64_t> freq_range, int64_t top_k, const std::vector<std::string> &tokens,
+               bool prepend, int32_t num_workers, int32_t op_connector_size);
 
   ~BuildVocabOp() = default;
 
@@ -20,6 +20,7 @@
 #include <memory>
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -30,10 +31,204 @@
 
 namespace mindspore {
 namespace dataset {
+class SentencePieceVocab;
 class TensorOperation;
 class Vectors;
+class Vocab;
+
+using WordIdType = int32_t;
+using WordType = std::string;
+
+/// \brief Vocab object that is used to save pairs of words and ids.
+/// \note It contains a map that maps each word (str) to an id (int), and the reverse.
+class Vocab {
+ public:
+  /// \brief Build a vocab from an unordered_map. IDs must be unique and contiguous.
+  /// \param[in] words An unordered_map of word-id pairs.
+  /// \param[out] vocab A vocab object.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///   // Build a map
+  ///   std::unordered_map<std::string, int32_t> dict;
+  ///   dict["banana"] = 0;
+  ///   dict["apple"] = 1;
+  ///   dict["cat"] = 2;
+  ///   dict["dog"] = 3;
+  ///   // Build vocab from map
+  ///   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  ///   Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
+  /// \endcode
+  static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
+                                      std::shared_ptr<Vocab> *vocab);
+
+  /// \brief Build a vocab from a C++ vector; IDs are assigned automatically, unique and contiguous.
+  /// \param[in] words A vector of strings containing words.
+  /// \param[in] special_tokens A vector of strings containing special tokens.
+  /// \param[in] prepend_special Whether special_tokens will be prepended (true) or appended (false) to the vocab.
+  /// \param[out] vocab A vocab object.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///   // Build vocab from a vector of words; special tokens are prepended to the vocab
+  ///   std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
+  ///   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  ///   Status s = Vocab::BuildFromVector(list, {"<unk>"}, true, &vocab);
+  /// \endcode
+  static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
+                                bool prepend_special, std::shared_ptr<Vocab> *vocab);
+
+  /// \brief Build a vocab from a vocab file; IDs will be assigned automatically.
+  /// \param[in] path Path to the vocab file; each line in the file is treated as a word (spaces included).
+  /// \param[in] delimiter Delimiter used to break each line; characters after the delimiter are discarded.
+  /// \param[in] vocab_size Number of lines to be read from the file.
+  /// \param[in] special_tokens A vector of strings containing special tokens.
+  /// \param[in] prepend_special Whether special_tokens will be prepended (true) or appended (false) to the vocab.
+  /// \param[out] vocab A vocab object.
+  /// \return Status code.
+  /// \par Example
+  /// \code
+  ///   // Build vocab from a local file
+  ///   std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
+  ///   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
+  ///   Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
+  /// \endcode
+  static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
+                              const std::vector<WordType> &special_tokens, bool prepend_special,
+                              std::shared_ptr<Vocab> *vocab);
+
+  /// \brief Look up the id of a word; if the word does not exist in the vocab, return -1.
+  /// \param[in] word Word to be looked up.
+  /// \return ID of the word in the vocab.
+  /// \par Example
+  /// \code
+  ///   // lookup, convert token to id
+  ///   auto single_index = vocab->TokensToIds("home");
+  ///   single_index = vocab->TokensToIds("hello");
+  /// \endcode
+  WordIdType TokensToIds(const WordType &word) const;
+
+  /// \brief Look up the ids of a vector of words; any word that does not exist in the vocab maps to -1.
+  /// \param[in] words Words to be looked up.
+  /// \return IDs of the words in the vocab.
+  /// \par Example
+  /// \code
+  ///   // lookup multiple tokens
+  ///   auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});
+  ///   std::vector<int32_t> expected_multi_indexs = {0, 4};
+  ///   multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"});
+  ///   expected_multi_indexs = {0, -1};
+  /// \endcode
+  std::vector<WordIdType> TokensToIds(const std::vector<WordType> &words) const;
+
+  /// \brief Look up the word of an ID; if the ID does not exist in the vocab, return an empty string.
+  /// \param[in] id ID to be looked up.
+  /// \return The word corresponding to the ID.
+  /// \par Example
+  /// \code
+  ///   // reverse lookup, convert id to token
+  ///   auto single_word = vocab->IdsToTokens(2);
+  ///   single_word = vocab->IdsToTokens(-1);
+  /// \endcode
+  WordType IdsToTokens(const WordIdType &id);
+
+  /// \brief Look up the words of a vector of IDs; any ID that does not exist in the vocab maps to an empty string.
+  /// \param[in] ids IDs to be looked up.
+  /// \return The words corresponding to the IDs.
+  /// \par Example
+  /// \code
+  ///   // reverse lookup multiple ids
+  ///   auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
+  ///   std::vector<std::string> expected_multi_words = {"<pad>", "behind"};
+  ///   multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
+  ///   expected_multi_words = {"<pad>", ""};
+  /// \endcode
+  std::vector<WordType> IdsToTokens(const std::vector<WordIdType> &ids);
+
+  /// \brief Constructor. Shouldn't be called directly; it can't be private due to std::make_unique().
+  /// \param[in] map Sanitized word2id map.
+  explicit Vocab(std::unordered_map<WordType, WordIdType> map);
+
+  /// \brief Add one word to the vocab, incrementing its index automatically.
+  /// \param[in] word Word to be added; it is skipped if it already exists.
+  void AppendWord(const std::string &word);
+
+  /// \brief Return a read-only vocab as an unordered_map.
+  /// \return An unordered_map of word2id.
+  const std::unordered_map<WordType, WordIdType> &GetVocab() { return word2id_; }
+
+  /// \brief Constructor.
+  Vocab() = default;
+
+  /// \brief Destructor.
+  ~Vocab() = default;
+
+  static const WordIdType kNoTokenExists;
+  static const WordType kNoIdExists;
+
+ private:
+  std::unordered_map<WordType, WordIdType> word2id_;
+  std::unordered_map<WordIdType, WordType> id2word_;
+};
+
+/// \brief SentencePiece object that is used to do word segmentation.
+class SentencePieceVocab {
+ public:
+  /// \brief Build a SentencePiece object from a file.
+  /// \param[in] path_list Path to the file which contains the SentencePiece training corpus.
+  /// \param[in] vocab_size Vocabulary size.
+  /// \param[in] character_coverage Fraction of characters covered by the model. Good defaults are 0.9995 for
+  ///    languages with a rich character set like Japanese or Chinese, and 1.0 for other languages with a small
+  ///    character set.
+  /// \param[in] model_type Any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE, SentencePieceModel.CHAR,
+  ///    SentencePieceModel.WORD]; the default is SentencePieceModel.UNIGRAM. The input sentence must be
+  ///    pre-tokenized when using SentencePieceModel.WORD.
+  ///    - SentencePieceModel.UNIGRAM, the Unigram Language Model: the next word in the sentence is assumed
+  ///      to be independent of the words generated by the model before it.
+  ///    - SentencePieceModel.BPE, the byte-pair-encoding algorithm, which replaces the most frequent pair
+  ///      of bytes in a sentence with a single, unused byte.
+  ///    - SentencePieceModel.CHAR, the char-based SentencePiece model type.
+  ///    - SentencePieceModel.WORD, the word-based SentencePiece model type.
+  /// \param[in] params A dictionary with no incoming parameters (the parameters are derived from the
+  ///    SentencePiece library).
+  /// \return SentencePieceVocab, vocab built from the file.
+  /// \par Example
+  /// \code
+  ///   std::string dataset_path;
+  ///   dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  ///   std::vector<std::string> path_list;
+  ///   path_list.emplace_back(dataset_path);
+  ///   std::unordered_map<std::string, std::string> param_map;
+  ///   std::shared_ptr<SentencePieceVocab> spm = std::make_unique<SentencePieceVocab>();
+  ///   Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995,
+  ///                                                 SentencePieceModel::kUnigram, param_map, &spm);
+  /// \endcode
+  static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
+                              const float character_coverage, const SentencePieceModel model_type,
+                              const std::unordered_map<std::string, std::string> &params,
+                              std::shared_ptr<SentencePieceVocab> *vocab);
+
+  /// \brief Save the SentencePiece model to the given file path.
+  /// \param[in] vocab A SentencePiece object to be saved.
+  /// \param[in] path Path to store the model.
+  /// \param[in] filename The file name the model is saved as.
+  /// \par Example
+  /// \code
+  ///   // Save vocab model to local
+  ///   vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model");
+  /// \endcode
+  static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
+
+  /// \brief Constructor.
+  SentencePieceVocab();
+
+  /// \brief Destructor.
+  ~SentencePieceVocab() = default;
+
+  const std::string &model_proto();
+
+  void set_model_proto(const std::string model_proto);
+
+ private:
+  std::string model_proto_;
+};
+
 // Transform operations for text
 namespace text {
@@ -414,7 +609,7 @@ class MS_API NormalizeUTF8 final : public TensorTransform {
   /// \brief Constructor.
   /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
   ///   NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd] (default=NormalizeForm::kNfkc).
-  ///   See http://unicode.org/reports/tr15/ for details.
+  ///   See <http://unicode.org/reports/tr15/> for details.
   ///   - NormalizeForm.kNone, remain the input string tensor unchanged.
   ///   - NormalizeForm.kNfc, normalizes with Normalization Form C.
   ///   - NormalizeForm.kNfkc, normalizes with Normalization Form KC.
@@ -217,7 +217,7 @@ Status LookupOperation::ValidateParams() {
     LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
   }
   if (unknown_token_ != std::nullopt) {
-    default_id_ = vocab_->Lookup(*unknown_token_);
+    default_id_ = vocab_->TokensToIds(*unknown_token_);
     if (default_id_ == Vocab::kNoTokenExists) {
       std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
       LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
@@ -30,7 +30,7 @@ Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<T
   std::vector<WordIdType> word_ids;
   word_ids.reserve(input->Size());
   for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
-    WordIdType word_id = vocab_->Lookup(std::string(*itr));
+    WordIdType word_id = vocab_->TokensToIds(std::string(*itr));
     word_ids.emplace_back(word_id == Vocab::kNoTokenExists ? default_id_ : word_id);
     CHECK_FAIL_RETURN_UNEXPECTED(word_ids.back() != Vocab::kNoTokenExists,
                                  "Lookup: invalid data, token: \"" + std::string(*itr) +
@@ -23,9 +23,9 @@
 #include <vector>
 
 #include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/util/status.h"
-#include "minddata/dataset/text/vocab.h"
 
 namespace mindspore {
 namespace dataset {
@@ -24,10 +24,10 @@
 #include <memory>
 
 #include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
 #include "minddata/dataset/util/status.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 
 namespace mindspore {
 namespace dataset {
@@ -46,7 +46,7 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
     if (start > 0) {
       word = suffix_indicator_ + word;
     }
-    if (vocab_->Lookup(word) != Vocab::kNoTokenExists) {
+    if (vocab_->TokensToIds(word) != Vocab::kNoTokenExists) {
       *out_found = true;
       break;
     }
@@ -23,9 +23,9 @@
 #include "cppjieba/Unicode.hpp"
 
 #include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/kernels/tensor_op.h"
 #include "minddata/dataset/text/kernels/tokenizer_op.h"
-#include "minddata/dataset/text/vocab.h"
 #include "minddata/dataset/util/status.h"
 
 using cppjieba::DecodeRunesInString;
@@ -14,16 +14,18 @@
  * limitations under the License.
  */
 
-#include "minddata/dataset/text/sentence_piece_vocab.h"
-
-#include <sentencepiece_trainer.h>
 #include <sentencepiece_processor.h>
+#include <sentencepiece_trainer.h>
 
 #include <fstream>
 
+#include "include/common/utils/utils.h"
 #include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/include/dataset/text.h"
+#include "minddata/dataset/util/path.h"
 #include "minddata/dataset/util/status.h"
 #include "utils/file_utils.h"
 #include "utils/ms_utils.h"
-#include "include/common/utils/utils.h"
-#include "minddata/dataset/util/path.h"
 
 namespace mindspore {
 namespace dataset {
@@ -1,50 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
-
-#include <string>
-#include <memory>
-#include <vector>
-#include <unordered_map>
-
-#include "minddata/dataset/util/status.h"
-#include "minddata/dataset/include/dataset/constants.h"
-
-namespace mindspore {
-namespace dataset {
-
-class SentencePieceVocab {
- public:
-  static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
-                              const float character_coverage, const SentencePieceModel model_type,
-                              const std::unordered_map<std::string, std::string> &params,
-                              std::shared_ptr<SentencePieceVocab> *vocab);
-
-  static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
-
-  SentencePieceVocab();
-
-  ~SentencePieceVocab() = default;
-
-  const std::string &model_proto();
-
-  void set_model_proto(const std::string model_proto);
-
- private:
-  std::string model_proto_;
-};
-}  // namespace dataset
-}  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#include "minddata/dataset/text/vocab.h"
-
-#include <fstream>
-#include <unordered_set>
-#include <unordered_map>
-#include <utility>
+#include <algorithm>
+#include <fstream>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 
+#include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/util/status.h"
 #include "utils/file_utils.h"
 #ifndef ENABLE_ANDROID
 #include "utils/log_adapter.h"
@@ -33,18 +33,18 @@ namespace mindspore {
 namespace dataset {
 Vocab::Vocab(std::unordered_map<WordType, WordIdType> word2id) { word2id_ = std::move(word2id); }
 
-WordIdType Vocab::Lookup(const WordType &word) const {
+WordIdType Vocab::TokensToIds(const WordType &word) const {
   auto itr = word2id_.find(word);
   return itr == word2id_.end() ? kNoTokenExists : itr->second;
 }
 
-std::vector<WordIdType> Vocab::Lookup(const std::vector<WordType> &words) const {
+std::vector<WordIdType> Vocab::TokensToIds(const std::vector<WordType> &words) const {
   std::vector<WordIdType> ids;
-  std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return Lookup(w); });
+  std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return TokensToIds(w); });
   return ids;
 }
 
-WordType Vocab::ReverseLookup(const WordIdType &id) {
+WordType Vocab::IdsToTokens(const WordIdType &id) {
   // lazy initialization: reverse lookup is uncommon, so don't hold the reverse map in memory by default
   if (id2word_.empty()) {
     for (const auto [word_, id_] : word2id_) {
@@ -55,7 +55,7 @@ WordType Vocab::ReverseLookup(const WordIdType &id) {
   return itr == id2word_.end() ? kNoIdExists : itr->second;
 }
 
-std::vector<WordType> Vocab::ReverseLookup(const std::vector<WordIdType> &ids) {
+std::vector<WordType> Vocab::IdsToTokens(const std::vector<WordIdType> &ids) {
   // lazy initialization: reverse lookup is uncommon, so don't hold the reverse map in memory by default
   if (id2word_.empty()) {
     for (const auto [word_, id_] : word2id_) {
@@ -63,50 +63,11 @@ std::vector<WordType> Vocab::ReverseLookup(const std::vector<WordIdType> &ids) {
     }
   }
   std::vector<WordType> words;
-  std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return ReverseLookup(i); });
+  std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return IdsToTokens(i); });
   return words;
 }
 
-#ifdef ENABLE_PYTHON
-Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
-                              std::shared_ptr<Vocab> *vocab) {
-  if (vocab == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyList: input vocab can not be null");
-  }
-  // check of duplication on both words and special_tokens will be performed in python
-  // special_tokens and words both need to be unique, and shouldn't overlap
-  std::unordered_map<WordType, WordIdType> word2id;
-  // if special is added in front, normal words id will start from number of special tokens
-  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
-
-  for (auto word : words) {
-    word2id[py::str(word)] = word_id++;
-  }
-
-  word_id = prepend_special ? 0 : word2id.size();
-
-  for (auto special_token : special_tokens) {
-    word2id[py::str(special_token)] = word_id++;
-  }
-
-  *vocab = std::make_shared<Vocab>(std::move(word2id));
-  return Status::OK();
-}
-
-Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) {
-  if (vocab == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyDict: input vocab can not be null");
-  }
-  std::unordered_map<WordType, WordIdType> word2id;
-  for (auto p : words) {
-    word2id[py::str(p.first)] = py::reinterpret_borrow<py::int_>(p.second);
-  }
-  *vocab = std::make_shared<Vocab>(std::move(word2id));
-  return Status::OK();
-}
-#endif
-
-void Vocab::append_word(const std::string &word) {
+void Vocab::AppendWord(const std::string &word) {
   if (word2id_.find(word) == word2id_.end()) {
     word2id_[word] = word2id_.size();
   }
@@ -161,11 +122,11 @@ Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vec
   return Status::OK();
 }
 
-Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
-                               const std::vector<WordType> &special_tokens, bool prepend_special,
-                               std::shared_ptr<Vocab> *vocab) {
+Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
+                            const std::vector<WordType> &special_tokens, bool prepend_special,
+                            std::shared_ptr<Vocab> *vocab) {
   if (vocab == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFileCpp: input vocab can not be null");
+    RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null");
   }
   // Validate parameters
   auto realpath = FileUtils::GetRealPath(path.c_str());
@@ -227,56 +188,6 @@ Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delim
   return Status::OK();
 }
 
-Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
-                            const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab) {
-  if (vocab == nullptr) {
-    RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null");
-  }
-  // python validator checks special_tokens doesn't contain any duplicate words
-  std::unordered_set<std::string> specials;
-  // used to check that words in file don't contain any special token that already exists
-  for (auto word : special_tokens) {
-    specials.insert(py::str(word));
-  }
-  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
-  std::unordered_map<WordType, WordIdType> word2id;
-
-  auto realpath = FileUtils::GetRealPath(path.c_str());
-  if (!realpath.has_value()) {
-    RETURN_STATUS_UNEXPECTED("Get real path failed, path=" + path);
-  }
-
-  std::fstream handle(realpath.value(), std::ios::in);
-  CHECK_FAIL_RETURN_UNEXPECTED(handle.good() && handle.is_open(), "from_file: fail to open:" + path);
-  std::string word;
-  while (std::getline(handle, word)) {
-    if (!delimiter.empty()) {
-      // if delimiter is not found, find_first_of would return std::string::npos which is -1
-      word = word.substr(0, word.find_first_of(delimiter));
-    }
-    if (word2id.find(word) != word2id.end()) {
-      handle.close();
-      RETURN_STATUS_UNEXPECTED("from_file: duplicate word:" + word + ".");
-    }
-    if (specials.find(word) != specials.end()) {
-      handle.close();
-      RETURN_STATUS_UNEXPECTED("from_file: special_tokens and word_list contain duplicate word:" + word);
-    }
-    word2id[word] = word_id++;
-    // break if enough row is read, if vocab_size is smaller than 0
-    if (word2id.size() == vocab_size) break;
-  }
-  handle.close();
-  word_id = prepend_special ? 0 : word2id.size();
-
-  for (auto special_token : special_tokens) {
-    word2id[py::str(special_token)] = word_id++;
-  }
-
-  *vocab = std::make_shared<Vocab>(std::move(word2id));
-  return Status::OK();
-}
-
 const WordIdType Vocab::kNoTokenExists = -1;
 const WordType Vocab::kNoIdExists = std::string();
 
@@ -1,143 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_
-#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_
-
-#include <string>
-#include <memory>
-#include <unordered_map>
-#include <vector>
-
-#include "minddata/dataset/util/status.h"
-#ifdef ENABLE_PYTHON
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-#endif
-
-namespace mindspore {
-namespace dataset {
-#ifdef ENABLE_PYTHON
-namespace py = pybind11;
-#endif
-
-using WordIdType = int32_t;
-using WordType = std::string;
-
-class Vocab {
- public:
-#ifdef ENABLE_PYTHON
-  // Build a vocab from a python dictionary key is each word ,id needs to start from 2, no duplicate and continuous
-  // @param const py::dict &words - a dictionary containing word, word id pair.
-  // @param std::shared_ptr<Vocab> *vocab - return value, vocab object
-  // @return error code
-  static Status BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab);
-
-  // Build a vocab from a python list, id will be assigned automatically, start from 2
-  // @param const py::list &words - a list of string, used to build vocab, id starts from 2
-  // @param std::shared_ptr<Vocab> *vocab - return value, vocab object
-  // @return error code
-  static Status BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
-                                std::shared_ptr<Vocab> *vocab);
-
-  // Build a vocab from reading a vocab file, id are automatically assigned, start from 2
-  // @param std::string &path - path to vocab file , each line is assumed to contain 1 word
-  // @param std::string &delimiter - delimiter to break each line with
-  // @param int32_t vocab_size - number of words to read from file
-  // @param std::shared_ptr<Vocab> *vocab - return value, vocab object
-  // @return error code
-  static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
-                              const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab);
-#endif
-
-  /// \brief Build a vocab from a c++ map. id needs to start from 2, no duplicate and continuous
-  /// \param[in] words An unordered_map containing word, word id pair.
-  /// \param[out] vocab A vocab object
-  /// \return Error code
-  static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
-                                      std::shared_ptr<Vocab> *vocab);
-
-  /// \brief Build a vocab from a c++ vector. id needs to start from 2, no duplicate and continuous
-  /// \param[in] words A vector of string, used to build vocab, id starts from 2
-  /// \param[in] special_tokens A vector of string contain special tokens
-  /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
-  /// \param[out] vocab A vocab object
-  /// \return Error code
-  static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
-                                bool prepend_special, std::shared_ptr<Vocab> *vocab);
-
-  /// \brief Build a vocab from reading a vocab file, id are automatically assigned, start from 2
-  /// \param[in] path Path to vocab file , each line is assumed to contain 1 word
-  /// \param[in] delimiter Delimiter to break each line with
-  /// \param[in] vocab_size Number of words to read from file
-  /// \param[in] special_tokens A vector of string contain special tokens
-  /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
-  /// \param[out] vocab A vocab object
-  /// \return Error code
-  static Status BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
-                                 const std::vector<WordType> &special_tokens, bool prepend_special,
-                                 std::shared_ptr<Vocab> *vocab);
-
-  // Lookup the id of a word, if word doesn't exist in vocab, return default_id
-  // @param const WordType word - word to look up
-  // @param WordIdType default_id - word id to return to user when its not in the vocab
-  // @return WordIdType, word_id
-  WordIdType Lookup(const WordType &word) const;
-
-  // Lookup the ids of a vector of words, if word doesn't exist in vocab, return default_id
-  // @param const WordType word - word to look up
-  // @param WordIdType default_id - word id to return to user when its not in the vocab
-  // @return WordIdType, word_id
-  std::vector<WordIdType> Lookup(const std::vector<WordType> &words) const;
-
-  // Find the word of a id, if word doesn't exist in vocab, return empty string
-  // @param const WordIdType id - id to reverse look up
-  // @return WordType, word
-  WordType ReverseLookup(const WordIdType &id);
-
-  // Find the words of a vector of ids, if word doesn't exist in vocab, return empty string
-  // @param const WordIdType id - id to reverse look up
-  // @return WordType, word
-  std::vector<WordType> ReverseLookup(const std::vector<WordIdType> &ids);
-
-  // constructor, shouldn't be called directly, can't be private due to std::make_unique()
-  // @param std::unordered_map<WordType, WordIdType> map - sanitized word2id map
-  explicit Vocab(std::unordered_map<WordType, WordIdType> map);
-
-  Vocab() = default;
-
-  // add one word to vocab, increment it's index automatically
-  // @param std::string & word - word to be added will skip if word already exists
-  void append_word(const std::string &word);
-
-  // return a read-only vocab
-  const std::unordered_map<WordType, WordIdType> vocab() { return word2id_; }
-
-  // destructor
-  ~Vocab() = default;
-
-  static const WordIdType kNoTokenExists;
-  static const WordType kNoIdExists;
-
- private:
-  std::unordered_map<WordType, WordIdType> word2id_;
-  std::unordered_map<WordIdType, WordType> id2word_;
-};
-
-}  // namespace dataset
-}  // namespace mindspore
-
-#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_
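(The Vocab and SentencePieceVocab declarations deleted in the two hunks above are not lost: they reappear, now with documentation, in the public header include/dataset/text.h earlier in this diff.)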
@@ -47,7 +47,7 @@ import numpy as np
 import mindspore._c_dataengine as cde
 from mindspore.common import dtype as mstype
 
-from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
+from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType, SentencePieceVocab
 from .validators import check_lookup, check_jieba_add_dict, check_to_vectors, \
     check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
     check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \
@@ -386,6 +386,7 @@ class SentencePieceTokenizer(TextTensorOperation):
         self.out_type = out_type
 
     def parse(self):
+        self.mode = self.mode.c_sentence_piece_vocab if isinstance(self.mode, SentencePieceVocab) else self.mode
         return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type])
 
 
|
@ -141,7 +141,7 @@ class Vocab:
|
|||
>>> dataset = dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
|
||||
"""
|
||||
|
||||
vocab = Vocab()
|
||||
vocab = cls()
|
||||
vocab.c_vocab = dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first)
|
||||
return vocab
|
||||
|
||||
|
@@ -211,7 +211,7 @@ class Vocab:
             vocab_size = -1
         if special_tokens is None:
             special_tokens = []
-        vocab = Vocab()
+        vocab = cls()
         vocab.c_vocab = cde.Vocab.from_file(file_path, delimiter, vocab_size, special_tokens, special_first)
         return vocab
 
@@ -232,16 +232,19 @@ class Vocab:
         >>> vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
         """
 
-        vocab = Vocab()
+        vocab = cls()
         vocab.c_vocab = cde.Vocab.from_dict(word_dict)
         return vocab
 
 
-class SentencePieceVocab(cde.SentencePieceVocab):
+class SentencePieceVocab:
     """
     SentencePiece object that is used to do word segmentation.
     """
 
+    def __init__(self):
+        self.c_sentence_piece_vocab = None
+
     @classmethod
     @check_from_dataset_sentencepiece
     def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params):
@@ -278,8 +281,11 @@ class SentencePieceVocab:
         ...                                           SentencePieceModel.UNIGRAM, {})
         """
 
-        return dataset.build_sentencepiece_vocab(col_names, vocab_size, character_coverage,
-                                                 model_type, params)
+        sentence_piece_vocab = cls()
+        sentence_piece_vocab.c_sentence_piece_vocab = dataset.build_sentencepiece_vocab(col_names, vocab_size,
+                                                                                        character_coverage,
+                                                                                        model_type, params)
+        return sentence_piece_vocab
 
     @classmethod
     @check_from_file_sentencepiece
@@ -321,8 +327,11 @@ class SentencePieceVocab:
         ...                                          SentencePieceModel.UNIGRAM, {})
         """
 
-        return super().from_file(file_path, vocab_size, character_coverage,
-                                 DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)
+        sentence_piece_vocab = cls()
+        sentence_piece_vocab.c_sentence_piece_vocab = \
+            cde.SentencePieceVocab.from_file(file_path, vocab_size, character_coverage,
+                                             DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)
+        return sentence_piece_vocab
 
     @classmethod
     @check_save_model
@@ -342,7 +351,7 @@ class SentencePieceVocab:
         >>> text.SentencePieceVocab.save_model(vocab, "./", "m.model")
         """
 
-        super().save_model(vocab, path, filename)
+        cde.SentencePieceVocab.save_model(vocab.c_sentence_piece_vocab, path, filename)
 
 
 def to_str(array, encoding='utf8'):
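The pattern across the Python hunks above: the wrappers no longer inherit from the pybind classes. Vocab holds its C++ object in c_vocab and SentencePieceVocab in c_sentence_piece_vocab, and the factory classmethods build instances via cls() rather than a hard-coded class name, so user subclasses of either wrapper keep working. Callers that need the underlying C++ handle, such as SentencePieceTokenizer.parse earlier in this diff, now unwrap it explicitly.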
@@ -551,7 +551,7 @@ def check_save_model(method):
     [vocab, path, filename], _ = parse_user_args(method, *args, **kwargs)
 
     if vocab is not None:
-        type_check(vocab, (cde.SentencePieceVocab,), "vocab")
+        type_check(vocab, (text.SentencePieceVocab,), "vocab")
 
     if path is not None:
         type_check(path, (str,), "path")
@@ -573,7 +573,7 @@ def check_sentence_piece_tokenizer(method):
     def new_method(self, *args, **kwargs):
         [mode, out_type], _ = parse_user_args(method, *args, **kwargs)
 
-        type_check(mode, (str, cde.SentencePieceVocab), "mode is not an instance of str or cde.SentencePieceVocab.")
+        type_check(mode, (str, text.SentencePieceVocab), "mode is not an instance of str or text.SentencePieceVocab.")
         type_check(out_type, (SPieceTokenizerOutType,), "out_type is not an instance of SPieceTokenizerOutType")
 
         return method(self, *args, **kwargs)
@@ -20,7 +20,7 @@
 
 #include "common/common.h"
 #include "include/api/status.h"
-#include "minddata/dataset/text/vocab.h"
+#include "minddata/dataset/include/dataset/text.h"
 
 using mindspore::dataset::Tensor;
 using mindspore::dataset::Vocab;
@@ -47,7 +47,7 @@ TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) {
   std::vector<std::string> words = {"apple", "dog", "egg"};
   std::vector<int64_t> expected = {1, 3, -1};
   for (uint32_t i = 0; i < words.size(); ++i) {
-    int32_t x = vocab->Lookup(words[i]);
+    int32_t x = vocab->TokensToIds(words[i]);
     EXPECT_EQ(x, expected[i]);
   }
 }
@@ -65,7 +65,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) {
   std::vector<std::string> words = {"apple", "dog", "egg"};
   std::vector<int64_t> expected = {-1, -1, -1};
   for (uint32_t i = 0; i < words.size(); ++i) {
-    int32_t x = vocab->Lookup(words[i]);
+    int32_t x = vocab->TokensToIds(words[i]);
     EXPECT_EQ(x, expected[i]);
   }
 }
@@ -96,7 +96,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) {
   std::vector<std::string> words = {"apple", "banana", "fox"};
   std::vector<int64_t> expected = {1, 2, -1};
   for (uint32_t i = 0; i < words.size(); ++i) {
-    int32_t x = vocab->Lookup(words[i]);
+    int32_t x = vocab->TokensToIds(words[i]);
     EXPECT_EQ(x, expected[i]);
   }
 }
@@ -113,7 +113,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) {
   std::vector<std::string> words = {"apple", "<unk>", "fox"};
   std::vector<int64_t> expected = {0, 5, -1};
   for (uint32_t i = 0; i < words.size(); ++i) {
-    int32_t x = vocab->Lookup(words[i]);
+    int32_t x = vocab->TokensToIds(words[i]);
     EXPECT_EQ(x, expected[i]);
   }
 }
@@ -131,7 +131,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) {
   std::vector<std::string> words = {"apple", "banana", "fox", "<pad>"};
   std::vector<int64_t> expected = {0, 1, -1, -1};
   for (uint32_t i = 0; i < words.size(); ++i) {
-    int32_t x = vocab->Lookup(words[i]);
+    int32_t x = vocab->TokensToIds(words[i]);
     EXPECT_EQ(x, expected[i]);
   }
 }
@@ -149,7 +149,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) {
   std::vector<std::string> words = {"apple", "banana", "fox"};
   std::vector<int64_t> expected = {-1, -1, -1};
   for (uint32_t i = 0; i < words.size(); ++i) {
-    int32_t x = vocab->Lookup(words[i]);
+    int32_t x = vocab->TokensToIds(words[i]);
     EXPECT_EQ(x, expected[i]);
   }
 }
@@ -195,14 +195,14 @@ TEST_F(MindDataTestVocab, TestVocabFromFile) {
   // Build vocab from local file
   std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
-  Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
+  Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
   EXPECT_EQ(s, Status::OK());
 
   // Look up specified words
   std::vector<std::string> words = {"not", "all"};
   std::vector<int64_t> expected = {2, 3};
   for (uint32_t i = 0; i < words.size(); ++i) {
-    int32_t x = vocab->Lookup(words[i]);
+    int32_t x = vocab->TokensToIds(words[i]);
     EXPECT_EQ(x, expected[i]);
   }
 }
@@ -212,7 +212,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail1) {
   // Build vocab from a local file which does not exist
   std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt";
   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
-  Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab);
+  Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {}, true, &vocab);
   EXPECT_NE(s, Status::OK());
 }
 
@@ -223,7 +223,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail2) {
   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
 
   // Expected failure: vocab_size should be either -1 or positive integer
-  Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab);
+  Status s = Vocab::BuildFromFile(vocab_dir, ",", -2, {}, true, &vocab);
   EXPECT_NE(s, Status::OK());
 }
 
@@ -234,7 +234,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail3) {
   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
 
   // Expected failure: duplicate special token <unk>
-  Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
+  Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
   EXPECT_NE(s, Status::OK());
 }
 
@@ -245,6 +245,6 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail4) {
   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
 
   // Expected failure: special_tokens and word_list contain duplicate word
-  Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"home"}, true, &vocab);
+  Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"home"}, true, &vocab);
   EXPECT_NE(s, Status::OK());
 }
@@ -23,7 +23,6 @@
 #include "minddata/dataset/include/dataset/datasets.h"
 #include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/include/dataset/transforms.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 
 using namespace mindspore::dataset;
 using mindspore::dataset::SentencePieceModel;
@@ -27,7 +27,6 @@
 #include "minddata/dataset/text/fast_text.h"
 #include "minddata/dataset/text/glove.h"
 #include "minddata/dataset/text/vectors.h"
-#include "minddata/dataset/text/vocab.h"
 
 using namespace mindspore::dataset;
 using mindspore::Status;
@@ -797,7 +796,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
   // Iterate the dataset and get each row
   std::unordered_map<std::string, mindspore::MSTensor> row;
   ASSERT_OK(iter->GetNextRow(&row));
-  std::vector<std::string> expected = {"welcome to beijing","",""};
+  std::vector<std::string> expected = {"welcome to beijing", "", ""};
 
   uint64_t i = 0;
 
@@ -806,7 +805,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
     std::shared_ptr<Tensor> de_expected_tensor;
     ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
     mindspore::MSTensor ms_expected_tensor =
-        mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
+      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
     EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
     ASSERT_OK(iter->GetNextRow(&row));
     i++;
@@ -1709,8 +1708,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) {
   EXPECT_NE(ds, nullptr);
 
   // Create ToNumber operation on ds
-  std::shared_ptr<TensorTransform> to_number =
-    std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
+  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
   EXPECT_NE(to_number, nullptr);
 
   // Create a Map operation on ds
@@ -1760,7 +1758,8 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) {
   EXPECT_NE(ds, nullptr);
 
   // Create ToNumber operation on ds
-  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
+  std::shared_ptr<TensorTransform> to_number =
+    std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
   EXPECT_NE(to_number, nullptr);
 
   // Create a Map operation on ds
@@ -2143,8 +2142,7 @@ TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   std::vector<std::vector<std::string>> expected = {
-    {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a",
-     "is-a-text",
+    {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text",
      "a-text-file.", "text-file.-&", "file.-&-&"},
     {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
      "happy-every-day.", "every-day.-&", "day.-&-&"},
@@ -4371,8 +4369,7 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
   Status s = GloVe::BuildFromFile(&glove, vectors_dir);
   EXPECT_EQ(s, Status::OK());
 
-  std::shared_ptr<TensorTransform> lookup =
-    std::make_shared<text::ToVectors>(glove);
+  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
   EXPECT_NE(lookup, nullptr);
 
   // Create Map operation on ds
@@ -4388,14 +4385,13 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {
-    {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
-    {0, 0, 0, 0, 0, 0},
-    {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
-    {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
-    {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
-    {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
-    {0, 0, 0, 0, 0, 0}};
+  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
+                                              {0, 0, 0, 0, 0, 0},
+                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
+                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
+                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
+                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
+                                              {0, 0, 0, 0, 0, 0}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@@ -4434,8 +4430,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
   Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
   EXPECT_EQ(s, Status::OK());
 
-  std::shared_ptr<TensorTransform> lookup =
-    std::make_shared<text::ToVectors>(glove);
+  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
   EXPECT_NE(lookup, nullptr);
 
   // Create Map operation on ds
@@ -4451,14 +4446,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
   ASSERT_OK(iter->GetNextRow(&row));
 
   uint64_t i = 0;
-  std::vector<std::vector<float>> expected = {
-    {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
-    {0, 0, 0, 0, 0, 0},
-    {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
-    {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
-    {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
-    {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
-    {0, 0, 0, 0, 0, 0}};
+  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
+                                              {0, 0, 0, 0, 0, 0},
+                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
+                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
+                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
+                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
+                                              {0, 0, 0, 0, 0, 0}};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind.Shape();
@ -4498,8 +4492,7 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
|
|||
EXPECT_EQ(s, Status::OK());
|
||||
|
||||
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
|
||||
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::ToVectors>(glove, unknown_init);
|
||||
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create Map operation on ds
|
||||
|
@ -4515,14 +4508,13 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
|
|||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
|
||||
uint64_t i = 0;
|
||||
std::vector<std::vector<float>> expected = {
|
||||
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
|
||||
{-1, -1, -1, -1, -1, -1},
|
||||
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
|
||||
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
|
||||
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
|
||||
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
|
||||
{-1, -1, -1, -1, -1, -1}};
|
||||
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
|
||||
{-1, -1, -1, -1, -1, -1},
|
||||
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
|
||||
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
|
||||
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
|
||||
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
|
||||
{-1, -1, -1, -1, -1, -1}};
|
||||
while (row.size() != 0) {
|
||||
auto ind = row["text"];
|
||||
MS_LOG(INFO) << ind.Shape();
|
||||
|
@ -4562,8 +4554,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
|
|||
EXPECT_EQ(s, Status::OK());
|
||||
|
||||
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
|
||||
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::ToVectors>(glove, unknown_init, true);
|
||||
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init, true);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create Map operation on ds
|
||||
|
@ -4579,14 +4570,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
|
|||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
|
||||
uint64_t i = 0;
|
||||
std::vector<std::vector<float>> expected = {
|
||||
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
|
||||
{-1, -1, -1, -1, -1, -1},
|
||||
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
|
||||
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
|
||||
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
|
||||
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
|
||||
{-1, -1, -1, -1, -1, -1}};
|
||||
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
|
||||
{-1, -1, -1, -1, -1, -1},
|
||||
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
|
||||
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
|
||||
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
|
||||
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
|
||||
{-1, -1, -1, -1, -1, -1}};
|
||||
while (row.size() != 0) {
|
||||
auto ind = row["text"];
|
||||
MS_LOG(INFO) << ind.Shape();
|
||||
|
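Taken together, the four GloVe tests pin down the ToVectors contract: tokens missing from the table map to zero vectors by default, unknown_init overrides that fill, and the trailing bool enables a lower-case fallback lookup for tokens not found in their original case. A minimal sketch of the full-parameter form (vectors_dir and ds are placeholders carried over from the tests):

  // Sketch only: 6-d GloVe vectors with a -1 fill for unknown tokens.
  std::shared_ptr<GloVe> glove;
  Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);  // read at most 100 vectors
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> lookup =
      std::make_shared<text::ToVectors>(glove, unknown_init, true);  // true: retry misses in lower case
  ds = ds->Map({lookup}, {"text"});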
@ -4748,13 +4738,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
ASSERT_OK(iter->GetNextRow(&row));

uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0,0,0,0,0},
{0,0,0,0,0},
{0.117336,0.362446,-0.983326,0.939264,-0.05648},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{0,0,0,0,0},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{0,0,0,0,0}};
std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{0, 0, 0, 0, 0},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();

@ -4810,13 +4800,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
ASSERT_OK(iter->GetNextRow(&row));

uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0,0,0,0,0},
{0,0,0,0,0},
{-0.155665,0.664073,-0.538499,1.22657,-0.2162},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{0,0,0,0,0},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{0,0,0,0,0}};
std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{0, 0, 0, 0, 0},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();

@ -4873,13 +4863,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
ASSERT_OK(iter->GetNextRow(&row));

uint64_t i = 0;
std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
{-1,-1,-1,-1,-1},
{-0.155665,0.664073,-0.538499,1.22657,-0.2162},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{-1,-1,-1,-1,-1},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{-1,-1,-1,-1,-1}};
std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
{-1, -1, -1, -1, -1},
{-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{-1, -1, -1, -1, -1},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{-1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();

@ -4936,13 +4926,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
ASSERT_OK(iter->GetNextRow(&row));

uint64_t i = 0;
std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
{-1,-1,-1,-1,-1},
{0.117336,0.362446,-0.983326,0.939264,-0.05648},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{-1,-1,-1,-1,-1},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{-1,-1,-1,-1,-1}};
std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
{-1, -1, -1, -1, -1},
{0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{-1, -1, -1, -1, -1},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{-1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
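The CharNGram tests mirror the GloVe ones with 5-dimensional vectors: zeros (or unknown_init) for unknown tokens, and the same build-then-ToVectors flow. A minimal sketch, assuming CharNGram exposes the same BuildFromFile shape as GloVe (vectors_dir is a placeholder):

  // Sketch only: char-n-gram vectors with a -1 fill for unknown tokens.
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  std::vector<float> unknown_init(5, -1);
  std::shared_ptr<TensorTransform> lookup =
      std::make_shared<text::ToVectors>(char_n_gram, unknown_init);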
@ -22,7 +22,6 @@
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/vocab.h"

using namespace mindspore::dataset;
using mindspore::Status;

@ -42,7 +41,7 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
} while (false)

/// Feature: C++ text.Vocab class.
/// Description: test Lookup() ReverseLookup() methods of text::Vocab.
/// Description: test TokensToIds() IdsToTokens() methods of text::Vocab.
/// Expectation: success.
TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupAndReverseLookup.";

@ -53,30 +52,30 @@ TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
EXPECT_EQ(s, Status::OK());

// lookup, convert token to id
auto single_index = vocab->Lookup("home");
auto single_index = vocab->TokensToIds("home");
EXPECT_EQ(single_index, 2);
single_index = vocab->Lookup("hello");
single_index = vocab->TokensToIds("hello");
EXPECT_EQ(single_index, -1);

// lookup multiple tokens
auto multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "behind"});
auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});
std::vector<int32_t> expected_multi_indexs = {0, 4};
EXPECT_EQ(multi_indexs, expected_multi_indexs);
multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "apple"});
multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"});
expected_multi_indexs = {0, -1};
EXPECT_EQ(multi_indexs, expected_multi_indexs);

// reverse lookup, convert id to token
auto single_word = vocab->ReverseLookup(2);
auto single_word = vocab->IdsToTokens(2);
EXPECT_EQ(single_word, "home");
single_word = vocab->ReverseLookup(-1);
single_word = vocab->IdsToTokens(-1);
EXPECT_EQ(single_word, "");

// reverse lookup multiple ids
auto multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 4});
auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
std::vector<std::string> expected_multi_words = {"<pad>", "behind"};
EXPECT_EQ(multi_words, expected_multi_words);
multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 99});
multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
expected_multi_words = {"<pad>", ""};
EXPECT_EQ(multi_words, expected_multi_words);
}
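The rename is mechanical: Lookup becomes TokensToIds and ReverseLookup becomes IdsToTokens, with the miss semantics unchanged (-1 for an unknown token, "" for an out-of-range id). A minimal sketch against the new names; `vocab` is assumed to be built as in the test above:

  // Sketch only: the renamed lookup API on an already-built vocab.
  int32_t id = vocab->TokensToIds("home");  // 2 in the test fixture
  std::vector<int32_t> ids = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});  // {0, 4}
  std::string token = vocab->IdsToTokens(2);  // "home"
  std::vector<std::string> tokens = vocab->IdsToTokens(std::vector<int32_t>{0, 99});  // {"<pad>", ""}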
@ -330,7 +329,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
EXPECT_NE(vocab, nullptr);

// Check if vocab has words or not
int32_t home_index = vocab->Lookup("home");
int32_t home_index = vocab->TokensToIds("home");
EXPECT_EQ(home_index, 4);

// Create Lookup operation on ds

@ -386,7 +385,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
EXPECT_NE(vocab, nullptr);

// Check if vocab has words or not
int32_t home_index = vocab->Lookup("home");
int32_t home_index = vocab->TokensToIds("home");
EXPECT_EQ(home_index, 2);

// Create Lookup operation on ds

@ -509,7 +508,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
EXPECT_NE(vocab, nullptr);

// Check if vocab has words or not
int32_t home_index = vocab->Lookup("home");
int32_t home_index = vocab->TokensToIds("home");
EXPECT_EQ(home_index, 2);

// Create Lookup operation on ds

@ -19,7 +19,7 @@
#include "common/common.h"
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/engine/datasetops/source/text_file_op.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"
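Net effect of the include changes across these test files: Vocab and SentencePieceVocab are now reached through the single public header minddata/dataset/include/dataset/text.h instead of the internal minddata/dataset/text/*.h headers.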