From 7b15e5a742b346787240e2d2897d002b4884be6e Mon Sep 17 00:00:00 2001
From: Zirui Wu
Date: Mon, 22 Jun 2020 11:03:47 -0400
Subject: [PATCH] rework Lookup to take an unknown_token string instead of a numeric id

Squashed fixup history: add test case, fix CI, address review comments
(several rounds), fix typo, add 2 more test cases, fix cpplint warnings,
fix test case error, fix docstring.
---
 .../ccsrc/dataset/api/python_bindings.cc      | 21 ++++++++---
 .../ccsrc/dataset/text/kernels/lookup_op.cc   |  8 +++--
 .../text/kernels/wordpiece_tokenizer_op.cc    |  3 +-
 mindspore/ccsrc/dataset/text/vocab.cc         |  7 ++--
 mindspore/ccsrc/dataset/text/vocab.h          |  9 ++---
 mindspore/dataset/text/transforms.py          | 12 +++----
 mindspore/dataset/text/validators.py          | 10 +++---
 mindspore/dataset/transforms/c_transforms.py  |  2 +-
 tests/ut/python/dataset/test_from_dataset.py  |  8 ++---
 tests/ut/python/dataset/test_nlp.py           |  2 +-
 tests/ut/python/dataset/test_vocab.py         | 35 ++++++++++++-------
 11 files changed, 70 insertions(+), 47 deletions(-)

diff --git a/mindspore/ccsrc/dataset/api/python_bindings.cc b/mindspore/ccsrc/dataset/api/python_bindings.cc
index 0ae64db6713..aa9f7af046d 100644
--- a/mindspore/ccsrc/dataset/api/python_bindings.cc
+++ b/mindspore/ccsrc/dataset/api/python_bindings.cc
@@ -609,10 +609,23 @@ void bindTokenizerOps(py::module *m) {
     *m, "UnicodeCharTokenizerOp", "Tokenize a scalar tensor of UTF-8 string to Unicode characters.")
     .def(py::init<>());
   (void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp",
-                                                                  "Tensor operation to LookUp each word")
-    .def(py::init<std::shared_ptr<Vocab>, WordIdType>(), py::arg("vocab"), py::arg("unknown"))
-    .def(py::init<std::shared_ptr<Vocab>>(), py::arg("vocab"));
-  (void)py::class_<NgramOp, TensorOp, std::shared_ptr<NgramOp>>(*m, "NgramOp", "TensorOp performs ngram mapping")
+                                                                  "Tensor operation to look up each word.")
+    .def(py::init([](std::shared_ptr<Vocab> vocab, const py::object &py_word) {
+      if (vocab == nullptr) {
+        THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "vocab object type is incorrect or null."));
+      }
+      if (py_word.is_none()) {
+        return std::make_shared<LookupOp>(vocab, Vocab::kNoTokenExists);
+      }
+      std::string word = py::reinterpret_borrow<py::str>(py_word);
+      WordIdType default_id = vocab->Lookup(word);
+      if (default_id == Vocab::kNoTokenExists) {
+        THROW_IF_ERROR(
+          Status(StatusCode::kUnexpectedError, "default unknown token: " + word + " doesn't exist in vocab."));
+      }
+      return std::make_shared<LookupOp>(vocab, default_id);
+    }));
+  (void)py::class_<NgramOp, TensorOp, std::shared_ptr<NgramOp>>(*m, "NgramOp", "TensorOp performs ngram mapping.")
     .def(py::init<const std::vector<std::string> &, int32_t, int32_t, const std::string &, const std::string &,
                   const std::string &>(),
          py::arg("ngrams"), py::arg("l_pad_len"), py::arg("r_pad_len"), py::arg("l_pad_token"), py::arg("r_pad_token"),
diff --git a/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc b/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc
index 07cf7aef5c0..1793301e1d3 100644
--- a/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc
+++ b/mindspore/ccsrc/dataset/text/kernels/lookup_op.cc
@@ -26,11 +26,15 @@ LookupOp::LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id)
 Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
   RETURN_UNEXPECTED_IF_NULL(vocab_);
-  CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor");
+  CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor.");
   std::vector<WordIdType> word_ids;
   word_ids.reserve(input->Size());
   for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); itr++) {
-    word_ids.push_back(vocab_->Lookup(std::string(*itr), default_id_));
+    WordIdType word_id = vocab_->Lookup(std::string(*itr));
+    word_ids.emplace_back(word_id == Vocab::kNoTokenExists ? default_id_ : word_id);
+    CHECK_FAIL_RETURN_UNEXPECTED(
+      word_ids.back() != Vocab::kNoTokenExists,
+      "Lookup Error: token: " + std::string(*itr) + " doesn't exist in vocab and no unknown token is specified.");
   }
 
   RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), type_,
diff --git a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
index e488c527cd4..e7ff0cc1ee3 100644
--- a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
+++ b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
@@ -43,8 +43,7 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
     if (start > 0) {
       word = suffix_indicator_ + word;
     }
-    WordIdType default_id = -1;
-    if (vocab_->Lookup(word, default_id) != default_id) {
+    if (vocab_->Lookup(word) != Vocab::kNoTokenExists) {
       *out_found = true;
       break;
     }
diff --git a/mindspore/ccsrc/dataset/text/vocab.cc b/mindspore/ccsrc/dataset/text/vocab.cc
index 100dc9d6558..399a9dee37a 100644
--- a/mindspore/ccsrc/dataset/text/vocab.cc
+++ b/mindspore/ccsrc/dataset/text/vocab.cc
@@ -24,9 +24,9 @@ namespace mindspore {
 namespace dataset {
 Vocab::Vocab(std::unordered_map<WordType, WordIdType> word2id) { word2id_ = std::move(word2id); }
 
-WordIdType Vocab::Lookup(const WordType &word, WordIdType default_id) const {
+WordIdType Vocab::Lookup(const WordType &word) const {
   auto itr = word2id_.find(word);
-  return itr == word2id_.end() ? default_id : itr->second;
+  return itr == word2id_.end() ? kNoTokenExists : itr->second;
 }
 
 Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tokens, bool prepend_special,
@@ -100,5 +100,8 @@ void Vocab::append_word(const std::string &word) {
     word2id_[word] = word2id_.size();
   }
 }
+
+const WordIdType Vocab::kNoTokenExists = -1;
+
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/dataset/text/vocab.h b/mindspore/ccsrc/dataset/text/vocab.h
index fc21c380a2b..410b0aeeca9 100644
--- a/mindspore/ccsrc/dataset/text/vocab.h
+++ b/mindspore/ccsrc/dataset/text/vocab.h
@@ -61,12 +61,7 @@ class Vocab {
   // @param const WordType word - word to look up
   // @param WordIdType default_id - word id to return to user when its not in the vocab
   // @return WordIdType, word_id
-  WordIdType Lookup(const WordType &word, WordIdType default_id) const;
-
-  // reverse lookup, lookup the word based on its id
-  // @param WordIdType id - word id to lookup to
-  // @return WordType the word
-  WordType Lookup(WordIdType id);
+  WordIdType Lookup(const WordType &word) const;
 
   // constructor, shouldn't be called directly, can't be private due to std::make_unique()
   // @param std::unordered_map map - sanitized word2id map
@@ -81,6 +76,8 @@ class Vocab {
   // destructor
   ~Vocab() = default;
 
+  static const WordIdType kNoTokenExists;
+
  private:
   std::unordered_map<WordType, WordIdType> word2id_;
 };
diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index f829e4ba737..7d79461b0f2 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -63,17 +63,13 @@ class Lookup(cde.LookupOp):
 
     Args:
         vocab(Vocab): a Vocab object.
-        unknown(int, optional): default id to lookup a word that is out of vocab. If no argument is passed, 1 will be
-            used to be the default id which is the convention for unknown_token . Otherwise, user is strongly
-            encouraged to pass in the id for (default=None).
+        unknown_token(str, optional): word to use for out-of-vocabulary (OOV) words during lookup; it must itself
+            be in the vocab. If it is None, looking up an OOV word raises a runtime error (default=None).
     """
 
     @check_lookup
-    def __init__(self, vocab, unknown=None):
-        if unknown is None:
-            super().__init__(vocab)
-        else:
-            super().__init__(vocab, unknown)
+    def __init__(self, vocab, unknown_token=None):
+        super().__init__(vocab, unknown_token)
 
 
 class Ngram(cde.NgramOp):
diff --git a/mindspore/dataset/text/validators.py b/mindspore/dataset/text/validators.py
index 39a0c4e6320..b2ee506629d 100644
--- a/mindspore/dataset/text/validators.py
+++ b/mindspore/dataset/text/validators.py
@@ -22,7 +22,7 @@ import mindspore.common.dtype as mstype
 import mindspore._c_dataengine as cde
 from mindspore._c_expression import typing
 
-from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, check_positive, \
+from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, \
     INT32_MAX, check_value
 
 
@@ -44,11 +44,11 @@ def check_lookup(method):
 
     @wraps(method)
     def new_method(self, *args, **kwargs):
-        [vocab, unknown], _ = parse_user_args(method, *args, **kwargs)
+        [vocab, unknown_token], _ = parse_user_args(method, *args, **kwargs)
+
+        if unknown_token is not None:
+            type_check(unknown_token, (str,), "unknown_token")
 
-        if unknown is not None:
-            type_check(unknown, (int,), "unknown")
-            check_positive(unknown)
         type_check(vocab, (cde.Vocab,), "vocab is not an instance of cde.Vocab.")
 
         return method(self, *args, **kwargs)
diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py
index 48e986202c3..62496822e51 100644
--- a/mindspore/dataset/transforms/c_transforms.py
+++ b/mindspore/dataset/transforms/c_transforms.py
@@ -197,7 +197,7 @@ class PadEnd(cde.PadEndOp):
 
 class Concatenate(cde.ConcatenateOp):
     """
-    Tensor operation to prepend and append to a tensor.
+    Tensor operation that concatenates all columns into a single tensor.
 
     Args:
         axis (int, optional): axis to concatenate the tensors along (Default=0).
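
A minimal usage sketch of the reworked Python API described by the transforms.py
and validators.py changes above. It assumes a small in-memory vocab; "corpus.txt"
is a placeholder path, not a file shipped with this patch:

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Special tokens are prepended, so <pad> -> 0, <unk> -> 1, then w1 -> 2, w2 -> 3, w3 -> 4.
    vocab = text.Vocab.from_list(["w1", "w2", "w3"], ["<pad>", "<unk>"], True)

    # unknown_token is now a word rather than a numeric id; it must itself exist in the
    # vocab, and every out-of-vocabulary word is mapped to that word's id.
    lookup = text.Lookup(vocab, "<unk>")

    # Omitting unknown_token keeps the strict behaviour: looking up an OOV word fails at runtime.
    strict_lookup = text.Lookup(vocab)

    # "corpus.txt" is assumed to contain one token per line.
    data = ds.TextFileDataset("corpus.txt", shuffle=False)
    data = data.map(input_columns=["text"], operations=lookup)

With unknown_token omitted, the op no longer falls back to a fixed numeric default
id; encountering an OOV word is reported as an error instead.
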
diff --git a/tests/ut/python/dataset/test_from_dataset.py b/tests/ut/python/dataset/test_from_dataset.py
index 514276fe704..94a5a5df02d 100644
--- a/tests/ut/python/dataset/test_from_dataset.py
+++ b/tests/ut/python/dataset/test_from_dataset.py
@@ -26,7 +26,7 @@ def test_demo_basic_from_dataset():
     vocab = text.Vocab.from_dataset(data, "text", freq_range=None, top_k=None,
                                     special_tokens=["<pad>", "<unk>"],
                                     special_first=True)
-    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
+    data = data.map(input_columns=["text"], operations=text.Lookup(vocab, "<unk>"))
     res = []
     for d in data.create_dict_iterator():
         res.append(d["text"].item())
@@ -39,7 +39,7 @@ def test_demo_basic_from_dataset_with_tokenizer():
     data = data.map(input_columns=["text"], operations=text.UnicodeCharTokenizer())
     vocab = text.Vocab.from_dataset(data, None, freq_range=None, top_k=None, special_tokens=["<pad>", "<unk>"],
                                     special_first=True)
-    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
+    data = data.map(input_columns=["text"], operations=text.Lookup(vocab, "<unk>"))
     res = []
     for d in data.create_dict_iterator():
         res.append(list(d["text"]))
@@ -60,7 +60,7 @@ def test_from_dataset():
         corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
         vocab = text.Vocab.from_dataset(corpus_dataset, None, freq_range, top_k, special_tokens=["<pad>", "<unk>"],
                                         special_first=True)
-        corpus_dataset = corpus_dataset.map(input_columns="text", operations=text.Lookup(vocab))
+        corpus_dataset = corpus_dataset.map(input_columns="text", operations=text.Lookup(vocab, "<unk>"))
         res = []
         for d in corpus_dataset.create_dict_iterator():
             res.append(list(d["text"]))
@@ -108,7 +108,7 @@ def test_from_dataset_special_token():
         corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
         vocab = text.Vocab.from_dataset(corpus_dataset, None, None, top_k, special_tokens, special_first)
         data = ds.GeneratorDataset(gen_input(texts), column_names=["text"])
-        data = data.map(input_columns="text", operations=text.Lookup(vocab))
+        data = data.map(input_columns="text", operations=text.Lookup(vocab, "<pad>"))
         res = []
         for d in data.create_dict_iterator():
             res.append(d["text"].item())
diff --git a/tests/ut/python/dataset/test_nlp.py b/tests/ut/python/dataset/test_nlp.py
index 6b44cfc80bb..0678316f7bf 100644
--- a/tests/ut/python/dataset/test_nlp.py
+++ b/tests/ut/python/dataset/test_nlp.py
@@ -34,7 +34,7 @@ def test_on_tokenized_line():
         jieba_op.add_word(word)
     data = data.map(input_columns=["text"], operations=jieba_op)
     vocab = text.Vocab.from_file(VOCAB_FILE, ",", special_tokens=["<pad>", "<unk>"])
-    lookup = text.Lookup(vocab)
+    lookup = text.Lookup(vocab, "<unk>")
     data = data.map(input_columns=["text"], operations=lookup)
     res = np.array([[10, 1, 11, 1, 12, 1, 15, 1, 13, 1, 14],
                     [11, 1, 12, 1, 10, 1, 14, 1, 13, 1, 15]], dtype=np.int32)
diff --git a/tests/ut/python/dataset/test_vocab.py b/tests/ut/python/dataset/test_vocab.py
index 35411e5c80e..901a822d5e4 100644
--- a/tests/ut/python/dataset/test_vocab.py
+++ b/tests/ut/python/dataset/test_vocab.py
@@ -26,7 +26,7 @@ SIMPLE_VOCAB_FILE = "../data/dataset/testVocab/simple_vocab_list.txt"
 
 def test_from_list_tutorial():
     vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "), ["<pad>", "<unk>"], True)
-    lookup = text.Lookup(vocab)
+    lookup = text.Lookup(vocab, "<unk>")
     data = ds.TextFileDataset(DATA_FILE, shuffle=False)
     data = data.map(input_columns=["text"], operations=lookup)
     ind = 0
@@ -50,7 +50,7 @@ def test_from_file_tutorial():
 
 def test_from_dict_tutorial():
     vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
-    lookup = text.Lookup(vocab, 6)  # default value is -1
+    lookup = text.Lookup(vocab, "<unk>")  # any unknown token will be mapped to the id of <unk>
     data = ds.TextFileDataset(DATA_FILE, shuffle=False)
     data = data.map(input_columns=["text"], operations=lookup)
     res = [3, 6, 2, 4, 5, 6]
@@ -65,28 +65,39 @@ def test_from_list():
         for word in texts.split(" "):
             yield (np.array(word, dtype='S'),)
 
-    def test_config(lookup_str, vocab_input, special_tokens, special_first):
+    def test_config(lookup_str, vocab_input, special_tokens, special_first, unknown_token):
         try:
             vocab = text.Vocab.from_list(vocab_input, special_tokens, special_first)
             data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
-            data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
+            data = data.map(input_columns=["text"], operations=text.Lookup(vocab, unknown_token))
             res = []
             for d in data.create_dict_iterator():
                 res.append(d["text"].item())
             return res
         except ValueError as e:
             return str(e)
+        except RuntimeError as e:
+            return str(e)
+        except TypeError as e:
+            return str(e)
 
     # test normal operations
-    assert test_config("w1 w2 w3 s1 s2", ["w1", "w2", "w3"], ["s1", "s2"], True) == [2, 3, 4, 0, 1]
-    assert test_config("w1 w2 w3 s1 s2", ["w1", "w2", "w3"], ["s1", "s2"], False) == [0, 1, 2, 3, 4]
-    assert test_config("w3 w2 w1", ["w1", "w2", "w3"], None, True) == [2, 1, 0]
-    assert test_config("w3 w2 w1", ["w1", "w2", "w3"], None, False) == [2, 1, 0]
+    assert test_config("w1 w2 w3 s1 s2 ephemeral", ["w1", "w2", "w3"], ["s1", "s2"], True, "s2") == [2, 3, 4, 0, 1, 1]
+    assert test_config("w1 w2 w3 s1 s2", ["w1", "w2", "w3"], ["s1", "s2"], False, "s2") == [0, 1, 2, 3, 4]
+    assert test_config("w3 w2 w1", ["w1", "w2", "w3"], None, True, "w1") == [2, 1, 0]
+    assert test_config("w3 w2 w1", ["w1", "w2", "w3"], None, False, "w1") == [2, 1, 0]
+    # test unknown token lookup
+    assert test_config("w1 un1 w3 un2", ["w1", "w2", "w3"], ["<pad>", "<unk>"], True, "<unk>") == [2, 1, 4, 1]
+    assert test_config("w1 un1 w3 un2", ["w1", "w2", "w3"], ["<pad>", "<unk>"], False, "<unk>") == [0, 4, 2, 4]
 
     # test exceptions
-    assert "word_list contains duplicate" in test_config("w1", ["w1", "w1"], [], True)
-    assert "special_tokens contains duplicate" in test_config("w1", ["w1", "w2"], ["s1", "s1"], True)
-    assert "special_tokens and word_list contain duplicate" in test_config("w1", ["w1", "w2"], ["s1", "w1"], True)
+    assert "doesn't exist in vocab." in test_config("un1", ["w1"], [], False, "unk")
+    assert "doesn't exist in vocab and no unknown token is specified." in test_config("un1", ["w1"], [], False, None)
+    assert "doesn't exist in vocab" in test_config("un1", ["w1"], [], False, None)
+    assert "word_list contains duplicate" in test_config("w1", ["w1", "w1"], [], True, "w1")
+    assert "special_tokens contains duplicate" in test_config("w1", ["w1", "w2"], ["s1", "s1"], True, "w1")
+    assert "special_tokens and word_list contain duplicate" in test_config("w1", ["w1", "w2"], ["s1", "w1"], True, "w1")
+    assert "is not of type" in test_config("w1", ["w1", "w2"], ["s1"], True, 123)
 
 
 def test_from_file():
@@ -99,7 +110,7 @@
         vocab = text.Vocab.from_file(SIMPLE_VOCAB_FILE, vocab_size=vocab_size, special_tokens=special_tokens,
                                      special_first=special_first)
         data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
-        data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
+        data = data.map(input_columns=["text"], operations=text.Lookup(vocab, "s2"))
         res = []
        for d in data.create_dict_iterator():
             res.append(d["text"].item())
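
The new test cases above also pin down the two failure modes of the reworked op.
A small illustrative snippet, not part of the patch, mirroring what test_from_list
checks (per those tests, both failures surface in Python as RuntimeError; the word
"missing" below is just a deliberately absent token):

    import mindspore.dataset.text as text

    vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})

    # 1) An unknown_token that is itself out of vocabulary fails when the op is constructed
    #    ("default unknown token: ... doesn't exist in vocab.").
    try:
        text.Lookup(vocab, "missing")
    except RuntimeError as e:
        print(e)

    # 2) With no unknown_token, the failure is deferred until an OOV word is actually
    #    looked up in the pipeline ("... doesn't exist in vocab and no unknown token
    #    is specified.").
    strict = text.Lookup(vocab)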