!30982 Vocab and SentencePieceVocab C++ interface alignment and Python interface refactoring

Merge pull request !30982 from 刘勇琪/master-vocab-sentencepiecevocab
This commit is contained in:
i-robot 2022-03-10 07:58:41 +00:00 committed by Gitee
commit 872cb74d3f
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
27 changed files with 371 additions and 473 deletions

View File

@ -38,8 +38,7 @@
#include "minddata/dataset/util/status.h"
#ifndef ENABLE_ANDROID
#include "minddata/dataset/engine/ir/cache/dataset_cache_impl.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/include/dataset/text.h"
#endif
// Sampler headers (in alphabetical order)

View File

@ -23,7 +23,6 @@
#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/engine/serdes.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/util/path.h"
// IR non-leaf nodes

View File

@ -19,12 +19,11 @@
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/text/char_n_gram.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/glove.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {
@ -32,28 +31,29 @@ PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
(void)py::class_<Vocab, std::shared_ptr<Vocab>>(*m, "Vocab")
.def(py::init<>())
.def_static("from_list",
[](const py::list &words, const py::list &special_tokens, bool special_first) {
[](const std::vector<std::string> &words,
const std::vector<std::string> &special_tokens, bool special_first) {
std::shared_ptr<Vocab> v;
THROW_IF_ERROR(Vocab::BuildFromPyList(words, special_tokens, special_first, &v));
THROW_IF_ERROR(Vocab::BuildFromVector(words, special_tokens, special_first, &v));
return v;
})
.def_static(
"from_file",
[](const std::string &path, const std::string &dlm, int32_t vocab_size,
const py::list &special_tokens, bool special_first) {
const std::vector<std::string> &special_tokens, bool special_first) {
std::shared_ptr<Vocab> v;
THROW_IF_ERROR(Vocab::BuildFromFile(path, dlm, vocab_size, special_tokens, special_first, &v));
return v;
})
.def_static("from_dict",
[](const py::dict &words) {
[](const std::unordered_map<WordType, WordIdType> &words) {
std::shared_ptr<Vocab> v;
THROW_IF_ERROR(Vocab::BuildFromPyDict(words, &v));
THROW_IF_ERROR(Vocab::BuildFromUnorderedMap(words, &v));
return v;
})
.def("tokens_to_ids",
[](Vocab &self, const std::vector<std::string> words) {
auto ids = self.Lookup(words);
auto ids = self.TokensToIds(words);
py::object ret;
if (ids.size() == 1) {
ret = py::int_(ids[0]);
@ -65,7 +65,7 @@ PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
})
.def("ids_to_tokens",
[](Vocab &self, const std::vector<int32_t> ids) {
auto words = self.ReverseLookup(ids);
auto words = self.IdsToTokens(ids);
py::object ret;
if (words.size() == 1) {
ret = py::str(words[0]);
@ -75,31 +75,19 @@ PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
}
return ret;
})
.def("vocab", [](Vocab &self) { return self.vocab(); });
.def("vocab", [](Vocab &self) { return self.GetVocab(); });
}));
PYBIND_REGISTER(SentencePieceVocab, 0, ([](const py::module *m) {
(void)py::class_<SentencePieceVocab, std::shared_ptr<SentencePieceVocab>>(*m, "SentencePieceVocab")
.def(py::init<>())
.def_static("from_file",
[](const py::list &paths, const int32_t vocab_size, const float character_coverage,
const SentencePieceModel model_type, const py::dict &params) {
[](const std::vector<std::string> &paths, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params) {
std::shared_ptr<SentencePieceVocab> v;
std::vector<std::string> path_list;
for (auto path : paths) {
path_list.emplace_back(py::str(path));
}
std::unordered_map<std::string, std::string> param_map;
for (auto param : params) {
std::string key = py::reinterpret_borrow<py::str>(param.first);
if (key == "input" || key == "vocab_size" || key == "model_prefix" ||
key == "character_coverage" || key == "model_type") {
continue;
}
param_map[key] = py::reinterpret_borrow<py::str>(param.second);
}
THROW_IF_ERROR(SentencePieceVocab::BuildFromFile(
path_list, vocab_size, character_coverage, model_type, param_map, &v));
paths, vocab_size, character_coverage, model_type, params, &v));
return v;
})
.def_static("save_model", [](const std::shared_ptr<SentencePieceVocab> *vocab, std::string path,

View File

@ -14,13 +14,12 @@
* limitations under the License.
*/
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "minddata/dataset/text/vectors.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl_bind.h"
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {

View File

@ -24,7 +24,7 @@
#include <vector>
#include "minddata/dataset/engine/tree_adapter.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/include/dataset/text.h"
namespace mindspore::dataset {
// Forward declare

View File

@ -28,9 +28,9 @@
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/engine/dataset_iterator.h"
#include "minddata/dataset/engine/datasetops/pipeline_op.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/util/queue.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "pybind11/pybind11.h"
namespace mindspore {
@ -54,7 +54,7 @@ class BuildSentencePieceVocabOp : public PipelineOp {
BuildSentencePieceVocabOp *s_p_vocab_ptr_;
};
BuildSentencePieceVocabOp(std::shared_ptr<SentencePieceVocab> vocab, std::vector<std::string> col_names,
BuildSentencePieceVocabOp(std::shared_ptr<dataset::SentencePieceVocab> vocab, std::vector<std::string> col_names,
int32_t vocab_size, float character_coverage, SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params, int32_t op_conn_size);

View File

@ -179,15 +179,15 @@ Status BuildVocabOp::CollectorThread() {
});
if (special_first_) {
for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
}
for (int64_t i = 0; i < num_words; i++) {
vocab_->append_word(words[i]);
vocab_->AppendWord(words[i]);
}
if (!special_first_) {
for (const std::string &sp_tk : special_tokens_) vocab_->append_word(sp_tk);
for (const std::string &sp_tk : special_tokens_) vocab_->AppendWord(sp_tk);
}
RETURN_IF_NOT_OK(out_connector_->SendEOE());
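The hunk above is what determines special-token IDs: AppendWord assigns indices in insertion order, so prepended specials take the smallest IDs and appended ones the largest. A hedged sketch of the same effect through the public BuildFromVector API; the IDs in the comments follow the assignment logic shown in this PR:

#include <memory>
#include <string>
#include <vector>

#include "minddata/dataset/include/dataset/text.h"

void SpecialTokenOrdering() {
  using mindspore::dataset::Vocab;
  std::vector<std::string> words = {"home", "behind"};

  std::shared_ptr<Vocab> prepended;
  (void)Vocab::BuildFromVector(words, {"<pad>"}, true, &prepended);
  // prepended: <pad>=0, home=1, behind=2

  std::shared_ptr<Vocab> appended;
  (void)Vocab::BuildFromVector(words, {"<pad>"}, false, &appended);
  // appended: home=0, behind=1, <pad>=2
}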

View File

@ -25,7 +25,7 @@
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/engine/dataset_iterator.h"
#include "minddata/dataset/engine/datasetops/parallel_op.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/util/queue.h"
#include "minddata/dataset/util/status.h"
@ -33,9 +33,9 @@ namespace mindspore {
namespace dataset {
class BuildVocabOp : public ParallelOp<TensorRow, TensorRow> {
public:
BuildVocabOp(std::shared_ptr<Vocab> vocab, std::vector<std::string> col_names, std::pair<int64_t, int64_t> freq_range,
int64_t top_k, const std::vector<std::string> &tokens, bool prepend, int32_t num_workers,
int32_t op_connector_size);
BuildVocabOp(std::shared_ptr<dataset::Vocab> vocab, std::vector<std::string> col_names,
std::pair<int64_t, int64_t> freq_range, int64_t top_k, const std::vector<std::string> &tokens,
bool prepend, int32_t num_workers, int32_t op_connector_size);
~BuildVocabOp() = default;

View File

@ -20,6 +20,7 @@
#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
@ -30,10 +31,204 @@
namespace mindspore {
namespace dataset {
class SentencePieceVocab;
class TensorOperation;
class Vectors;
class Vocab;
using WordIdType = int32_t;
using WordType = std::string;
/// \brief Vocab object that is used to store pairs of words and ids.
/// \note It contains a map from each word (str) to an id (int), and the reverse.
class Vocab {
public:
/// \brief Build a vocab from an unordered_map. IDs must be unique and contiguous.
/// \param[in] words An unordered_map containing word/id pairs.
/// \param[out] vocab A vocab object.
/// \return Status code.
/// \par Example
/// \code
/// // Build a map
/// std::unordered_map<std::string, int32_t> dict;
/// dict["banana"] = 0;
/// dict["apple"] = 1;
/// dict["cat"] = 2;
/// dict["dog"] = 3;
/// // Build vocab from map
/// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
/// Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
/// \endcode
static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from a C++ vector; IDs are assigned in order, unique and contiguous.
/// \param[in] words A vector of strings containing words.
/// \param[in] special_tokens A vector of strings containing special tokens.
/// \param[in] prepend_special Whether special_tokens are prepended (true) or appended (false) to the vocab.
/// \param[out] vocab A vocab object.
/// \return Status code.
/// \par Example
/// \code
/// // Build vocab from a vector of words, special tokens are prepended to vocab
/// std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
/// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
/// Status s = Vocab::BuildFromVector(list, {"<unk>"}, true, &vocab);
/// \endcode
static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
bool prepend_special, std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from a vocab file; IDs are assigned automatically.
/// \param[in] path Path to the vocab file; each line in the file is treated as a word (spaces included).
/// \param[in] delimiter Delimiter used to break each line; characters after the delimiter are discarded.
/// \param[in] vocab_size Number of lines to be read from the file.
/// \param[in] special_tokens A vector of strings containing special tokens.
/// \param[in] prepend_special Whether special_tokens are prepended (true) or appended (false) to the vocab.
/// \param[out] vocab A vocab object.
/// \return Status code.
/// \par Example
/// \code
/// // Build vocab from local file
/// std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
/// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
/// Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
/// \endcode
static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const std::vector<WordType> &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab);
/// \brief Look up the ID of a word; returns -1 if the word does not exist in the vocab.
/// \param word Word to be looked up.
/// \return ID of the word in the vocab.
/// \par Example
/// \code
/// // lookup, convert token to id
/// auto single_index = vocab->TokensToIds("home");
/// single_index = vocab->TokensToIds("hello");
/// \endcode
WordIdType TokensToIds(const WordType &word) const;
/// \brief Look up the IDs of multiple words; -1 is returned for any word not in the vocab.
/// \param words Words to be looked up.
/// \return IDs of the words in the vocab.
/// \par Example
/// \code
/// // lookup multiple tokens
/// auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});
/// std::vector<int32_t> expected_multi_indexs = {0, 4};
/// multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"});
/// expected_multi_indexs = {0, -1};
/// \endcode
std::vector<WordIdType> TokensToIds(const std::vector<WordType> &words) const;
/// \brief Look up the word of an ID; returns an empty string if the ID does not exist in the vocab.
/// \param id ID to be looked up.
/// \return The word corresponding to the ID.
/// \par Example
/// \code
/// // reverse lookup, convert id to token
/// auto single_word = vocab->IdsToTokens(2);
/// single_word = vocab->IdsToTokens(-1);
/// \endcode
WordType IdsToTokens(const WordIdType &id);
/// \brief Look up the words of multiple IDs; an empty string is returned for any ID not in the vocab.
/// \param ids IDs to be looked up.
/// \return The words corresponding to the IDs.
/// \par Example
/// \code
/// // reverse lookup multiple ids
/// auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
/// std::vector<std::string> expected_multi_words = {"<pad>", "behind"};
/// multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
/// expected_multi_words = {"<pad>", ""};
/// \endcode
std::vector<WordType> IdsToTokens(const std::vector<WordIdType> &ids);
/// \brief Constructor; should not be called directly, and cannot be private due to std::make_unique().
/// \param map Sanitized word2id map.
explicit Vocab(std::unordered_map<WordType, WordIdType> map);
/// \brief Add one word to the vocab and increment its index automatically.
/// \param word Word to be added; skipped if the word already exists.
void AppendWord(const std::string &word);
/// \brief Return a read-only vocab as an unordered_map.
/// \return An unordered_map from word to id.
const std::unordered_map<WordType, WordIdType> &GetVocab() { return word2id_; }
/// \brief Constructor.
Vocab() = default;
/// \brief Destructor.
~Vocab() = default;
static const WordIdType kNoTokenExists;
static const WordType kNoIdExists;
private:
std::unordered_map<WordType, WordIdType> word2id_;
std::unordered_map<WordIdType, WordType> id2word_;
};
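Pulling the per-method examples above together, a minimal end-to-end sketch of the renamed API, using in-memory data only and with error handling reduced to an early return:

#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "minddata/dataset/include/dataset/text.h"

void VocabRoundTrip() {
  using mindspore::dataset::Status;
  using mindspore::dataset::Vocab;

  std::unordered_map<std::string, int32_t> dict = {{"<pad>", 0}, {"home", 1}, {"behind", 2}};
  std::shared_ptr<Vocab> vocab;
  Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
  if (s != Status::OK()) {
    return;
  }

  // Token -> id: unknown tokens map to Vocab::kNoTokenExists (-1).
  std::vector<int32_t> ids = vocab->TokensToIds({"home", "apple"});  // {1, -1}
  (void)ids;

  // Id -> token: unknown ids map to Vocab::kNoIdExists ("").
  std::vector<std::string> words = vocab->IdsToTokens({0, 99});  // {"<pad>", ""}
  (void)words;
}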
/// \brief SentencePiece object that is used to do word segmentation.
class SentencePieceVocab {
public:
/// \brief Build a SentencePiece object from a file.
/// \param[in] path_list Paths to the files from which the SentencePiece vocab is built.
/// \param[in] vocab_size Vocabulary size.
/// \param[in] character_coverage Fraction of characters covered by the model. Good defaults are 0.9995 for
/// languages with a rich character set, such as Japanese or Chinese, and 1.0 for languages with a small
/// character set.
/// \param[in] model_type It can be any of [SentencePieceModel.UNIGRAM, SentencePieceModel.BPE,
/// SentencePieceModel.CHAR, SentencePieceModel.WORD]; the default is SentencePieceModel.UNIGRAM. The input
/// sentence must be pre-tokenized when using the SentencePieceModel.WORD type.
/// - SentencePieceModel.UNIGRAM, unigram language model: each word in the sentence is assumed to be
/// independent of the words the model generated before it.
/// - SentencePieceModel.BPE, byte pair encoding: the most frequent pair of bytes in a sentence is
/// replaced with a single, unused byte.
/// - SentencePieceModel.CHAR, a character-based SentencePiece model.
/// - SentencePieceModel.WORD, a word-based SentencePiece model.
/// \param[in] params An optional map of extra training parameters forwarded to the SentencePiece library;
/// typically left empty.
/// \return Status code; on success, *vocab holds the SentencePieceVocab built from the files.
/// \par Example
/// \code
/// std::string dataset_path;
/// dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
/// std::vector<std::string> path_list;
/// path_list.emplace_back(dataset_path);
/// std::unordered_map<std::string, std::string> param_map;
/// std::shared_ptr<SentencePieceVocab> spm = std::make_shared<SentencePieceVocab>();
/// Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995,
/// SentencePieceModel::kUnigram, param_map, &spm);
/// \endcode
static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab);
/// \brief Save the SentencePiece model into given file path.
/// \param[in] vocab A SentencePiece object to be saved.
/// \param[in] path Path to store the model.
/// \param[in] filename Name of the saved model file.
/// \return Status code.
/// \par Example
/// \code
/// // Save vocab model to local
/// vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model");
/// \endcode
static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
/// \brief Constructor.
SentencePieceVocab();
/// \brief Destructor.
~SentencePieceVocab() = default;
const std::string &model_proto();
void set_model_proto(const std::string model_proto);
private:
std::string model_proto_;
};
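And the matching sketch for the SentencePiece flow documented above, training from a corpus file and saving the model; "corpus.txt" and the output paths are placeholders:

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "minddata/dataset/include/dataset/text.h"

void TrainAndSaveSentencePiece() {
  using mindspore::dataset::SentencePieceModel;
  using mindspore::dataset::SentencePieceVocab;
  using mindspore::dataset::Status;

  std::vector<std::string> corpus = {"corpus.txt"};
  std::unordered_map<std::string, std::string> params;  // extra trainer options, usually empty

  std::shared_ptr<SentencePieceVocab> vocab;
  Status rc = SentencePieceVocab::BuildFromFile(corpus, 5000, 0.9995, SentencePieceModel::kUnigram,
                                                params, &vocab);
  if (rc != Status::OK()) {
    return;
  }
  (void)SentencePieceVocab::SaveModel(&vocab, "./", "m.model");
}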
// Transform operations for text
namespace text {
@ -414,7 +609,7 @@ class MS_API NormalizeUTF8 final : public TensorTransform {
/// \brief Constructor.
/// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone,NormalizeForm::kNfc,
/// NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc).
/// See http://unicode.org/reports/tr15/ for details.
/// See <http://unicode.org/reports/tr15/> for details.
/// - NormalizeForm.kNone, remain the input string tensor unchanged.
/// - NormalizeForm.kNfc, normalizes with Normalization Form C.
/// - NormalizeForm.kNfkc, normalizes with Normalization Form KC.

View File

@ -217,7 +217,7 @@ Status LookupOperation::ValidateParams() {
LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
if (unknown_token_ != std::nullopt) {
default_id_ = vocab_->Lookup(*unknown_token_);
default_id_ = vocab_->TokensToIds(*unknown_token_);
if (default_id_ == Vocab::kNoTokenExists) {
std::string err_msg = "Lookup: \"" + *unknown_token_ + "\" doesn't exist in vocab.";
LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);

View File

@ -30,7 +30,7 @@ Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<T
std::vector<WordIdType> word_ids;
word_ids.reserve(input->Size());
for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
WordIdType word_id = vocab_->Lookup(std::string(*itr));
WordIdType word_id = vocab_->TokensToIds(std::string(*itr));
word_ids.emplace_back(word_id == Vocab::kNoTokenExists ? default_id_ : word_id);
CHECK_FAIL_RETURN_UNEXPECTED(word_ids.back() != Vocab::kNoTokenExists,
"Lookup: invalid data, token: \"" + std::string(*itr) +

View File

@ -23,9 +23,9 @@
#include <vector>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {

View File

@ -24,10 +24,10 @@
#include <memory>
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
namespace mindspore {
namespace dataset {

View File

@ -46,7 +46,7 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
if (start > 0) {
word = suffix_indicator_ + word;
}
if (vocab_->Lookup(word) != Vocab::kNoTokenExists) {
if (vocab_->TokensToIds(word) != Vocab::kNoTokenExists) {
*out_found = true;
break;
}
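For context, LookupWord above is the inner check of a greedy longest-match loop: the tokenizer shrinks the candidate window until a piece is found in the vocab, prefixing the suffix indicator for non-initial pieces. A self-contained sketch of that algorithm, byte-oriented and using a plain set rather than the real operator's vocab:

#include <string>
#include <unordered_set>
#include <vector>

// Minimal greedy WordPiece split (assumption: "##" as the suffix indicator);
// returns an empty vector when the token cannot be segmented.
std::vector<std::string> GreedyWordPiece(const std::string &token,
                                         const std::unordered_set<std::string> &vocab) {
  std::vector<std::string> pieces;
  size_t start = 0;
  while (start < token.size()) {
    size_t end = token.size();
    bool found = false;
    for (; end > start; --end) {  // shrink the window until a piece is in vocab
      std::string piece = token.substr(start, end - start);
      if (start > 0) {
        piece = "##" + piece;
      }
      if (vocab.count(piece) > 0) {
        pieces.push_back(piece);
        found = true;
        break;
      }
    }
    if (!found) {
      return {};  // unknown token
    }
    start = end;
  }
  return pieces;
}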

View File

@ -23,9 +23,9 @@
#include "cppjieba/Unicode.hpp"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/text/kernels/tokenizer_op.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/util/status.h"
using cppjieba::DecodeRunesInString;

View File

@ -14,16 +14,18 @@
* limitations under the License.
*/
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include <sentencepiece_trainer.h>
#include <sentencepiece_processor.h>
#include <sentencepiece_trainer.h>
#include <fstream>
#include "include/common/utils/utils.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/util/status.h"
#include "utils/file_utils.h"
#include "utils/ms_utils.h"
#include "include/common/utils/utils.h"
#include "minddata/dataset/util/path.h"
namespace mindspore {
namespace dataset {

View File

@ -1,50 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_
#include <string>
#include <memory>
#include <vector>
#include <unordered_map>
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/include/dataset/constants.h"
namespace mindspore {
namespace dataset {
class SentencePieceVocab {
public:
static Status BuildFromFile(const std::vector<std::string> &path_list, const int32_t vocab_size,
const float character_coverage, const SentencePieceModel model_type,
const std::unordered_map<std::string, std::string> &params,
std::shared_ptr<SentencePieceVocab> *vocab);
static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, std::string path, std::string filename);
SentencePieceVocab();
~SentencePieceVocab() = default;
const std::string &model_proto();
void set_model_proto(const std::string model_proto);
private:
std::string model_proto_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_SENTENCE_PIECE_VOCAB_H_

View File

@ -14,14 +14,14 @@
* limitations under the License.
*/
#include "minddata/dataset/text/vocab.h"
#include <fstream>
#include <unordered_set>
#include <unordered_map>
#include <utility>
#include <algorithm>
#include <fstream>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/util/status.h"
#include "utils/file_utils.h"
#ifndef ENABLE_ANDROID
#include "utils/log_adapter.h"
@ -33,18 +33,18 @@ namespace mindspore {
namespace dataset {
Vocab::Vocab(std::unordered_map<WordType, WordIdType> word2id) { word2id_ = std::move(word2id); }
WordIdType Vocab::Lookup(const WordType &word) const {
WordIdType Vocab::TokensToIds(const WordType &word) const {
auto itr = word2id_.find(word);
return itr == word2id_.end() ? kNoTokenExists : itr->second;
}
std::vector<WordIdType> Vocab::Lookup(const std::vector<WordType> &words) const {
std::vector<WordIdType> Vocab::TokensToIds(const std::vector<WordType> &words) const {
std::vector<WordIdType> ids;
std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return Lookup(w); });
std::transform(words.begin(), words.end(), std::back_inserter(ids), [this](auto w) { return TokensToIds(w); });
return ids;
}
WordType Vocab::ReverseLookup(const WordIdType &id) {
WordType Vocab::IdsToTokens(const WordIdType &id) {
// lazy initialization, since reverse lookup is uncommon and the map would otherwise waste memory
if (id2word_.empty()) {
for (const auto [word_, id_] : word2id_) {
@ -55,7 +55,7 @@ WordType Vocab::ReverseLookup(const WordIdType &id) {
return itr == id2word_.end() ? kNoIdExists : itr->second;
}
std::vector<WordType> Vocab::ReverseLookup(const std::vector<WordIdType> &ids) {
std::vector<WordType> Vocab::IdsToTokens(const std::vector<WordIdType> &ids) {
// lazy initialization, since reverse lookup is uncommon and the map would otherwise waste memory
if (id2word_.empty()) {
for (const auto [word_, id_] : word2id_) {
@ -63,50 +63,11 @@ std::vector<WordType> Vocab::ReverseLookup(const std::vector<WordIdType> &ids) {
}
}
std::vector<WordType> words;
std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return ReverseLookup(i); });
std::transform(ids.begin(), ids.end(), std::back_inserter(words), [this](auto i) { return IdsToTokens(i); });
return words;
}
#ifdef ENABLE_PYTHON
Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab) {
if (vocab == nullptr) {
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyList: input vocab can not be null");
}
// check of duplication on both words and special_tokens will be performed in python
// special_tokens and words both need to be unique, and shouldn't overlap
std::unordered_map<WordType, WordIdType> word2id;
// if special is added in front, normal words id will start from number of special tokens
WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
for (auto word : words) {
word2id[py::str(word)] = word_id++;
}
word_id = prepend_special ? 0 : word2id.size();
for (auto special_token : special_tokens) {
word2id[py::str(special_token)] = word_id++;
}
*vocab = std::make_shared<Vocab>(std::move(word2id));
return Status::OK();
}
Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) {
if (vocab == nullptr) {
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromPyDict: input vocab can not be null");
}
std::unordered_map<WordType, WordIdType> word2id;
for (auto p : words) {
word2id[py::str(p.first)] = py::reinterpret_borrow<py::int_>(p.second);
}
*vocab = std::make_shared<Vocab>(std::move(word2id));
return Status::OK();
}
#endif
void Vocab::append_word(const std::string &word) {
void Vocab::AppendWord(const std::string &word) {
if (word2id_.find(word) == word2id_.end()) {
word2id_[word] = word2id_.size();
}
@ -161,11 +122,11 @@ Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vec
return Status::OK();
}
Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const std::vector<WordType> &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab) {
Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const std::vector<WordType> &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab) {
if (vocab == nullptr) {
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFileCpp: input vocab can not be null");
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null");
}
// Validate parameters
auto realpath = FileUtils::GetRealPath(path.c_str());
@ -227,56 +188,6 @@ Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delim
return Status::OK();
}
Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab) {
if (vocab == nullptr) {
RETURN_STATUS_UNEXPECTED("Vocab::BuildFromFile: input vocab can not be null");
}
// python validator checks special_tokens doesn't contain any duplicate words
std::unordered_set<std::string> specials;
// used to check that words in file don't contain any special token that already exists
for (auto word : special_tokens) {
specials.insert(py::str(word));
}
WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
std::unordered_map<WordType, WordIdType> word2id;
auto realpath = FileUtils::GetRealPath(path.c_str());
if (!realpath.has_value()) {
RETURN_STATUS_UNEXPECTED("Get real path failed, path=" + path);
}
std::fstream handle(realpath.value(), std::ios::in);
CHECK_FAIL_RETURN_UNEXPECTED(handle.good() && handle.is_open(), "from_file: fail to open:" + path);
std::string word;
while (std::getline(handle, word)) {
if (!delimiter.empty()) {
// if delimiter is not found, find_first_of would return std::string::npos which is -1
word = word.substr(0, word.find_first_of(delimiter));
}
if (word2id.find(word) != word2id.end()) {
handle.close();
RETURN_STATUS_UNEXPECTED("from_file: duplicate word:" + word + ".");
}
if (specials.find(word) != specials.end()) {
handle.close();
RETURN_STATUS_UNEXPECTED("from_file: special_tokens and word_list contain duplicate word:" + word);
}
word2id[word] = word_id++;
// break if enough row is read, if vocab_size is smaller than 0
if (word2id.size() == vocab_size) break;
}
handle.close();
word_id = prepend_special ? 0 : word2id.size();
for (auto special_token : special_tokens) {
word2id[py::str(special_token)] = word_id++;
}
*vocab = std::make_shared<Vocab>(std::move(word2id));
return Status::OK();
}
const WordIdType Vocab::kNoTokenExists = -1;
const WordType Vocab::kNoIdExists = std::string();
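The id2word_ handling above follows a lazy-inverse pattern: the reverse index is built on the first reverse lookup instead of in the constructor, since reverse lookup is the less common path. A generic sketch of the pattern:

#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>

class LazyReverseIndex {
 public:
  explicit LazyReverseIndex(std::unordered_map<std::string, int32_t> forward)
      : forward_(std::move(forward)) {}

  std::string Reverse(int32_t id) {
    if (reverse_.empty()) {  // build the inverse only when first needed
      for (const auto &[word, word_id] : forward_) {
        reverse_[word_id] = word;
      }
    }
    auto itr = reverse_.find(id);
    return itr == reverse_.end() ? std::string() : itr->second;
  }

 private:
  std::unordered_map<std::string, int32_t> forward_;
  std::unordered_map<int32_t, std::string> reverse_;
};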

View File

@ -1,143 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_
#include <string>
#include <memory>
#include <unordered_map>
#include <vector>
#include "minddata/dataset/util/status.h"
#ifdef ENABLE_PYTHON
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#endif
namespace mindspore {
namespace dataset {
#ifdef ENABLE_PYTHON
namespace py = pybind11;
#endif
using WordIdType = int32_t;
using WordType = std::string;
class Vocab {
public:
#ifdef ENABLE_PYTHON
// Build a vocab from a python dictionary key is each word ,id needs to start from 2, no duplicate and continuous
// @param const py::dict &words - a dictionary containing word, word id pair.
// @param std::shared_ptr<Vocab> *vocab - return value, vocab object
// @return error code
static Status BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab);
// Build a vocab from a python list, id will be assigned automatically, start from 2
// @param const py::list &words - a list of string, used to build vocab, id starts from 2
// @param std::shared_ptr<Vocab> *vocab - return value, vocab object
// @return error code
static Status BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab);
// Build a vocab from reading a vocab file, id are automatically assigned, start from 2
// @param std::string &path - path to vocab file , each line is assumed to contain 1 word
// @param std::string &delimiter - delimiter to break each line with
// @param int32_t vocab_size - number of words to read from file
// @param std::shared_ptr<Vocab> *vocab - return value, vocab object
// @return error code
static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const py::list &special_tokens, bool prepend_special, std::shared_ptr<Vocab> *vocab);
#endif
/// \brief Build a vocab from a c++ map. id needs to start from 2, no duplicate and continuous
/// \param[in] words An unordered_map containing word, word id pair.
/// \param[out] vocab A vocab object
/// \return Error code
static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from a c++ vector. id needs to start from 2, no duplicate and continuous
/// \param[in] words A vector of string, used to build vocab, id starts from 2
/// \param[in] special_tokens A vector of string contain special tokens
/// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
/// \param[out] vocab A vocab object
/// \return Error code
static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
bool prepend_special, std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from reading a vocab file, id are automatically assigned, start from 2
/// \param[in] path Path to vocab file , each line is assumed to contain 1 word
/// \param[in] delimiter Delimiter to break each line with
/// \param[in] vocab_size Number of words to read from file
/// \param[in] special_tokens A vector of string contain special tokens
/// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
/// \param[out] vocab A vocab object
/// \return Error code
static Status BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
const std::vector<WordType> &special_tokens, bool prepend_special,
std::shared_ptr<Vocab> *vocab);
// Lookup the id of a word, if word doesn't exist in vocab, return default_id
// @param const WordType word - word to look up
// @param WordIdType default_id - word id to return to user when its not in the vocab
// @return WordIdType, word_id
WordIdType Lookup(const WordType &word) const;
// Lookup the ids of a vector of words, if word doesn't exist in vocab, return default_id
// @param const WordType word - word to look up
// @param WordIdType default_id - word id to return to user when its not in the vocab
// @return WordIdType, word_id
std::vector<WordIdType> Lookup(const std::vector<WordType> &words) const;
// Find the word of a id, if word doesn't exist in vocab, return empty string
// @param const WordIdType id - id to reverse look up
// @return WordType, word
WordType ReverseLookup(const WordIdType &id);
// Find the words of a vector of ids, if word doesn't exist in vocab, return empty string
// @param const WordIdType id - id to reverse look up
// @return WordType, word
std::vector<WordType> ReverseLookup(const std::vector<WordIdType> &ids);
// constructor, shouldn't be called directly, can't be private due to std::make_unique()
// @param std::unordered_map<WordType, WordIdType> map - sanitized word2id map
explicit Vocab(std::unordered_map<WordType, WordIdType> map);
Vocab() = default;
// add one word to vocab, increment it's index automatically
// @param std::string & word - word to be added will skip if word already exists
void append_word(const std::string &word);
// return a read-only vocab
const std::unordered_map<WordType, WordIdType> vocab() { return word2id_; }
// destructor
~Vocab() = default;
static const WordIdType kNoTokenExists;
static const WordType kNoIdExists;
private:
std::unordered_map<WordType, WordIdType> word2id_;
std::unordered_map<WordIdType, WordType> id2word_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VOCAB_H_

View File

@ -47,7 +47,7 @@ import numpy as np
import mindspore._c_dataengine as cde
from mindspore.common import dtype as mstype
from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType, SentencePieceVocab
from .validators import check_lookup, check_jieba_add_dict, check_to_vectors, \
check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \
@ -386,6 +386,7 @@ class SentencePieceTokenizer(TextTensorOperation):
self.out_type = out_type
def parse(self):
self.mode = self.mode.c_sentence_piece_vocab if isinstance(self.mode, SentencePieceVocab) else self.mode
return cde.SentencePieceTokenizerOperation(self.mode, DE_C_INTER_SENTENCEPIECE_OUTTYPE[self.out_type])

View File

@ -141,7 +141,7 @@ class Vocab:
>>> dataset = dataset.map(operations=text.Lookup(vocab, "<unk>"), input_columns=["text"])
"""
vocab = Vocab()
vocab = cls()
vocab.c_vocab = dataset.build_vocab(columns, freq_range, top_k, special_tokens, special_first)
return vocab
@ -211,7 +211,7 @@ class Vocab:
vocab_size = -1
if special_tokens is None:
special_tokens = []
vocab = Vocab()
vocab = cls()
vocab.c_vocab = cde.Vocab.from_file(file_path, delimiter, vocab_size, special_tokens, special_first)
return vocab
@ -232,16 +232,19 @@ class Vocab:
>>> vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
"""
vocab = Vocab()
vocab = cls()
vocab.c_vocab = cde.Vocab.from_dict(word_dict)
return vocab
class SentencePieceVocab(cde.SentencePieceVocab):
class SentencePieceVocab:
"""
SentencePiece object that is used to do word segmentation.
"""
def __init__(self):
self.c_sentence_piece_vocab = None
@classmethod
@check_from_dataset_sentencepiece
def from_dataset(cls, dataset, col_names, vocab_size, character_coverage, model_type, params):
@ -278,8 +281,11 @@ class SentencePieceVocab(cde.SentencePieceVocab):
... SentencePieceModel.UNIGRAM, {})
"""
return dataset.build_sentencepiece_vocab(col_names, vocab_size, character_coverage,
model_type, params)
sentence_piece_vocab = cls()
sentence_piece_vocab.c_sentence_piece_vocab = dataset.build_sentencepiece_vocab(col_names, vocab_size,
character_coverage,
model_type, params)
return sentence_piece_vocab
@classmethod
@check_from_file_sentencepiece
@ -321,8 +327,11 @@ class SentencePieceVocab(cde.SentencePieceVocab):
... SentencePieceModel.UNIGRAM, {})
"""
return super().from_file(file_path, vocab_size, character_coverage,
DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)
sentence_piece_vocab = cls()
sentence_piece_vocab.c_sentence_piece_vocab = \
cde.SentencePieceVocab.from_file(file_path, vocab_size, character_coverage,
DE_C_INTER_SENTENCEPIECE_MODE[model_type], params)
return sentence_piece_vocab
@classmethod
@check_save_model
@ -342,7 +351,7 @@ class SentencePieceVocab(cde.SentencePieceVocab):
>>> text.SentencePieceVocab.save_model(vocab, "./", "m.model")
"""
super().save_model(vocab, path, filename)
cde.SentencePieceVocab.save_model(vocab.c_sentence_piece_vocab, path, filename)
def to_str(array, encoding='utf8'):

View File

@ -551,7 +551,7 @@ def check_save_model(method):
[vocab, path, filename], _ = parse_user_args(method, *args, **kwargs)
if vocab is not None:
type_check(vocab, (cde.SentencePieceVocab,), "vocab")
type_check(vocab, (text.SentencePieceVocab,), "vocab")
if path is not None:
type_check(path, (str,), "path")
@ -573,7 +573,7 @@ def check_sentence_piece_tokenizer(method):
def new_method(self, *args, **kwargs):
[mode, out_type], _ = parse_user_args(method, *args, **kwargs)
type_check(mode, (str, cde.SentencePieceVocab), "mode is not an instance of str or cde.SentencePieceVocab.")
type_check(mode, (str, text.SentencePieceVocab), "mode is not an instance of str or text.SentencePieceVocab.")
type_check(out_type, (SPieceTokenizerOutType,), "out_type is not an instance of SPieceTokenizerOutType")
return method(self, *args, **kwargs)

View File

@ -20,7 +20,7 @@
#include "common/common.h"
#include "include/api/status.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/include/dataset/text.h"
using mindspore::dataset::Tensor;
using mindspore::dataset::Vocab;
@ -47,7 +47,7 @@ TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) {
std::vector<std::string> words = {"apple", "dog", "egg"};
std::vector<int64_t> expected = {1, 3, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -65,7 +65,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) {
std::vector<std::string> words = {"apple", "dog", "egg"};
std::vector<int64_t> expected = {-1, -1, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -96,7 +96,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) {
std::vector<std::string> words = {"apple", "banana", "fox"};
std::vector<int64_t> expected = {1, 2, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -113,7 +113,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) {
std::vector<std::string> words = {"apple", "<unk>", "fox"};
std::vector<int64_t> expected = {0, 5, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -131,7 +131,7 @@ TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) {
std::vector<std::string> words = {"apple", "banana", "fox", "<pad>"};
std::vector<int64_t> expected = {0, 1, -1, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -149,7 +149,7 @@ TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) {
std::vector<std::string> words = {"apple", "banana", "fox"};
std::vector<int64_t> expected = {-1, -1, -1};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -195,14 +195,14 @@ TEST_F(MindDataTestVocab, TestVocabFromFile) {
// Build vocab from local file
std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
EXPECT_EQ(s, Status::OK());
// Look up specified words
std::vector<std::string> words = {"not", "all"};
std::vector<int64_t> expected = {2, 3};
for (uint32_t i = 0; i < words.size(); ++i) {
int32_t x = vocab->Lookup(words[i]);
int32_t x = vocab->TokensToIds(words[i]);
EXPECT_EQ(x, expected[i]);
}
}
@ -212,7 +212,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail1) {
// Build vocab from local file which is not exist
std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt";
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {}, true, &vocab);
EXPECT_NE(s, Status::OK());
}
@ -223,7 +223,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail2) {
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
// Expected failure: vocab_size should be either -1 or positive integer
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -2, {}, true, &vocab);
EXPECT_NE(s, Status::OK());
}
@ -234,7 +234,7 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail3) {
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
// Expected failure: duplicate special token <unk>
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
EXPECT_NE(s, Status::OK());
}
@ -245,6 +245,6 @@ TEST_F(MindDataTestVocab, TestVocabFromFileFail4) {
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
// Expected failure: special_tokens and word_list contain duplicate word
Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"home"}, true, &vocab);
Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"home"}, true, &vocab);
EXPECT_NE(s, Status::OK());
}

View File

@ -23,7 +23,6 @@
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
using namespace mindspore::dataset;
using mindspore::dataset::SentencePieceModel;

View File

@ -27,7 +27,6 @@
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/glove.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
using namespace mindspore::dataset;
using mindspore::Status;
@ -797,7 +796,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
std::vector<std::string> expected = {"welcome to beijing","",""};
std::vector<std::string> expected = {"welcome to beijing", "", ""};
uint64_t i = 0;
@ -806,7 +805,7 @@ TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
std::shared_ptr<Tensor> de_expected_tensor;
ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
mindspore::MSTensor ms_expected_tensor =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
ASSERT_OK(iter->GetNextRow(&row));
i++;
@ -1709,8 +1708,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number =
std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -1760,7 +1758,8 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
std::shared_ptr<TensorTransform> to_number =
std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -2143,8 +2142,7 @@ TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
ASSERT_OK(iter->GetNextRow(&row));
std::vector<std::vector<std::string>> expected = {
{"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a",
"is-a-text",
{"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text",
"a-text-file.", "text-file.-&", "file.-&-&"},
{"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
"happy-every-day.", "every-day.-&", "day.-&-&"},
@ -4371,8 +4369,7 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
Status s = GloVe::BuildFromFile(&glove, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::ToVectors>(glove);
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -4388,14 +4385,13 @@ TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4434,8 +4430,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::ToVectors>(glove);
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -4451,14 +4446,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4498,8 +4492,7 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::ToVectors>(glove, unknown_init);
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -4515,14 +4508,13 @@ TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4562,8 +4554,7 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::ToVectors>(glove, unknown_init, true);
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init, true);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -4579,14 +4570,13 @@ TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {
{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4748,13 +4738,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0,0,0,0,0},
{0,0,0,0,0},
{0.117336,0.362446,-0.983326,0.939264,-0.05648},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{0,0,0,0,0},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{0,0,0,0,0}};
std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{0, 0, 0, 0, 0},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4810,13 +4800,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0,0,0,0,0},
{0,0,0,0,0},
{-0.155665,0.664073,-0.538499,1.22657,-0.2162},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{0,0,0,0,0},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{0,0,0,0,0}};
std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{0, 0, 0, 0, 0},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4873,13 +4863,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
{-1,-1,-1,-1,-1},
{-0.155665,0.664073,-0.538499,1.22657,-0.2162},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{-1,-1,-1,-1,-1},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{-1,-1,-1,-1,-1}};
std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
{-1, -1, -1, -1, -1},
{-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{-1, -1, -1, -1, -1},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{-1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
@ -4936,13 +4926,13 @@ TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
{-1,-1,-1,-1,-1},
{0.117336,0.362446,-0.983326,0.939264,-0.05648},
{0.657201,2.11761,-1.59276,0.432072,1.21395},
{-1,-1,-1,-1,-1},
{-2.26956,0.288491,-0.740001,0.661703,0.147355},
{-1,-1,-1,-1,-1}};
std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
{-1, -1, -1, -1, -1},
{0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
{0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
{-1, -1, -1, -1, -1},
{-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
{-1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();

View File

@ -22,7 +22,6 @@
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/vocab.h"
using namespace mindspore::dataset;
using mindspore::Status;
@ -42,7 +41,7 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
} while (false)
/// Feature: C++ text.Vocab class.
/// Description: test Lookup() ReverseLookup() methods of text::Vocab.
/// Description: test TokensToIds() IdsToTokens() methods of text::Vocab.
/// Expectation: success.
TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupAndReverseLookup.";
@ -53,30 +52,30 @@ TEST_F(MindDataTestPipeline, TestVocabLookupAndReverseLookup) {
EXPECT_EQ(s, Status::OK());
// lookup, convert token to id
auto single_index = vocab->Lookup("home");
auto single_index = vocab->TokensToIds("home");
EXPECT_EQ(single_index, 2);
single_index = vocab->Lookup("hello");
single_index = vocab->TokensToIds("hello");
EXPECT_EQ(single_index, -1);
// lookup multiple tokens
auto multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "behind"});
auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});
std::vector<int32_t> expected_multi_indexs = {0, 4};
EXPECT_EQ(multi_indexs, expected_multi_indexs);
multi_indexs = vocab->Lookup(std::vector<std::string>{"<pad>", "apple"});
multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"});
expected_multi_indexs = {0, -1};
EXPECT_EQ(multi_indexs, expected_multi_indexs);
// reverse lookup, convert id to token
auto single_word = vocab->ReverseLookup(2);
auto single_word = vocab->IdsToTokens(2);
EXPECT_EQ(single_word, "home");
single_word = vocab->ReverseLookup(-1);
single_word = vocab->IdsToTokens(-1);
EXPECT_EQ(single_word, "");
// reverse lookup multiple ids
auto multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 4});
auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
std::vector<std::string> expected_multi_words = {"<pad>", "behind"};
EXPECT_EQ(multi_words, expected_multi_words);
multi_words = vocab->ReverseLookup(std::vector<int32_t>{0, 99});
multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
expected_multi_words = {"<pad>", ""};
EXPECT_EQ(multi_words, expected_multi_words);
}
@ -330,7 +329,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
EXPECT_NE(vocab, nullptr);
// Check if vocab has words or not
int32_t home_index = vocab->Lookup("home");
int32_t home_index = vocab->TokensToIds("home");
EXPECT_EQ(home_index, 4);
// Create Lookup operation on ds
@ -386,7 +385,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
EXPECT_NE(vocab, nullptr);
// Check if vocab has words or not
int32_t home_index = vocab->Lookup("home");
int32_t home_index = vocab->TokensToIds("home");
EXPECT_EQ(home_index, 2);
// Create Lookup operation on ds
@ -509,7 +508,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
EXPECT_NE(vocab, nullptr);
// Check if vocab has words or not
int32_t home_index = vocab->Lookup("home");
int32_t home_index = vocab->TokensToIds("home");
EXPECT_EQ(home_index, 2);
// Create Lookup operation on ds

View File

@ -19,7 +19,7 @@
#include "common/common.h"
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/engine/datasetops/source/text_file_op.h"
#include "gtest/gtest.h"
#include "utils/log_adapter.h"