forked from mindspore-Ecosystem/mindspore
Add text C++ API
This commit is contained in:
parent
1321483749
commit
4e56618d18
|
@ -15,9 +15,16 @@
|
|||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
#include "minddata/dataset/include/text.h"
|
||||
#include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/lookup_op.h"
|
||||
#include "minddata/dataset/text/kernels/ngram_op.h"
|
||||
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
|
||||
#include "minddata/dataset/text/kernels/sliding_window_op.h"
|
||||
#ifndef _WIN32
|
||||
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
|
||||
#endif
|
||||
#include "minddata/dataset/util/path.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
@ -29,6 +36,13 @@ namespace text {
|
|||
// FUNCTIONS TO CREATE TEXT OPERATIONS
|
||||
// (In alphabetical order)
|
||||
|
||||
std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
|
||||
const JiebaMode &mode, bool with_offsets) {
|
||||
auto op = std::make_shared<JiebaTokenizerOperation>(hmm_path, mp_path, mode, with_offsets);
|
||||
|
||||
return op->ValidateParams() ? op : nullptr;
|
||||
}
|
||||
|
||||
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
|
||||
const DataType &data_type) {
|
||||
auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);
|
||||
|
@ -36,6 +50,14 @@ std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, con
|
|||
return op->ValidateParams() ? op : nullptr;
|
||||
}
|
||||
|
||||
std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
|
||||
const std::pair<std::string, int32_t> &left_pad,
|
||||
const std::pair<std::string, int32_t> &right_pad, const std::string &separator) {
|
||||
auto op = std::make_shared<NgramOperation>(ngrams, left_pad, right_pad, separator);
|
||||
|
||||
return op->ValidateParams() ? op : nullptr;
|
||||
}
|
||||
|
||||
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
|
||||
const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type) {
|
||||
auto op = std::make_shared<SentencePieceTokenizerOperation>(vocab, out_type);
|
||||
|
@ -50,12 +72,79 @@ std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(const st
|
|||
return op->ValidateParams() ? op : nullptr;
|
||||
}
|
||||
|
||||
std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis) {
|
||||
auto op = std::make_shared<SlidingWindowOperation>(width, axis);
|
||||
|
||||
return op->ValidateParams() ? op : nullptr;
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets) {
|
||||
auto op = std::make_shared<WhitespaceTokenizerOperation>(with_offsets);
|
||||
|
||||
return op->ValidateParams() ? op : nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ####################################### Validator Functions ############################################ */
|
||||
|
||||
// Helper function to validate tokenizer directory parameter
|
||||
Status ValidateTokenizerDirParam(const std::string &tokenizer_name, const std::string &tokenizer_file) {
|
||||
if (tokenizer_file.empty()) {
|
||||
std::string err_msg = tokenizer_name + ": tokenizer_file is not specified.";
|
||||
MS_LOG(ERROR) << err_msg;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
|
||||
Path file(tokenizer_file);
|
||||
if (!file.Exists()) {
|
||||
std::string err_msg = tokenizer_name + ": tokenizer_file: [" + tokenizer_file + "] is an invalid directory path.";
|
||||
MS_LOG(ERROR) << err_msg;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
|
||||
if (access(tokenizer_file.c_str(), R_OK) == -1) {
|
||||
std::string err_msg = tokenizer_name + ": No access to specified tokenizer path: " + tokenizer_file;
|
||||
MS_LOG(ERROR) << err_msg;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/* ####################################### Derived TensorOperation classes ################################# */
|
||||
|
||||
// (In alphabetical order)
|
||||
|
||||
// JiebaTokenizerOperation
|
||||
// Stores the arguments unchanged; they are checked later by ValidateParams().
JiebaTokenizerOperation::JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path,
                                                 const JiebaMode &mode, bool with_offsets)
    : hmm_path_(hmm_path),
      mp_path_(mp_path),
      mode_(mode),
      with_offsets_(with_offsets) {}
|
||||
|
||||
// Validates both dictionary paths: each must be non-empty and must pass ValidateTokenizerDirParam.
// The HMM dictionary is checked before the MP dictionary at every stage; the first failure is returned.
Status JiebaTokenizerOperation::ValidateParams() {
  if (hmm_path_.empty()) {
    std::string msg = "JiebaTokenizer: The dict of HMMSegment in cppjieba is not provided.";
    MS_LOG(ERROR) << msg;
    RETURN_STATUS_SYNTAX_ERROR(msg);
  }

  if (mp_path_.empty()) {
    std::string msg = "JiebaTokenizer: The dict of MPSegment in cppjieba is not provided.";
    MS_LOG(ERROR) << msg;
    RETURN_STATUS_SYNTAX_ERROR(msg);
  }

  RETURN_IF_NOT_OK(ValidateTokenizerDirParam("JiebaTokenizer", hmm_path_));
  // The result of the last check is the result of the whole validation.
  return ValidateTokenizerDirParam("JiebaTokenizer", mp_path_);
}
|
||||
|
||||
std::shared_ptr<TensorOp> JiebaTokenizerOperation::Build() {
|
||||
std::shared_ptr<JiebaTokenizerOp> tensor_op =
|
||||
std::make_shared<JiebaTokenizerOp>(hmm_path_, mp_path_, mode_, with_offsets_);
|
||||
return tensor_op;
|
||||
}
|
||||
|
||||
// LookupOperation
|
||||
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
|
||||
const DataType &data_type)
|
||||
|
@ -83,6 +172,54 @@ std::shared_ptr<TensorOp> LookupOperation::Build() {
|
|||
return tensor_op;
|
||||
}
|
||||
|
||||
// NgramOperation
|
||||
// Stores the arguments unchanged; they are checked later by ValidateParams().
NgramOperation::NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
                               const std::pair<std::string, int32_t> &right_pad, const std::string &separator)
    : ngrams_(ngrams),
      left_pad_(left_pad),
      right_pad_(right_pad),
      separator_(separator) {}
|
||||
|
||||
// Validates the Ngram parameters.
// \return Status::OK() when ngrams_ is non-empty with only positive entries and both pad widths are
//     non-negative; otherwise the first violation is logged and returned via RETURN_STATUS_SYNTAX_ERROR.
Status NgramOperation::ValidateParams() {
  if (ngrams_.empty()) {
    std::string err_msg = "Ngram : Container cannot be empty.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  // Range-for avoids comparing a signed int32_t index against the unsigned size() of the vector.
  for (const int32_t gram : ngrams_) {
    if (gram <= 0) {
      std::string err_msg = "Ngram : The value of ngrams vector must be greater than 0: " + std::to_string(gram);
      MS_LOG(ERROR) << err_msg;
      RETURN_STATUS_SYNTAX_ERROR(err_msg);
    }
  }

  if (left_pad_.second < 0) {
    std::string err_msg =
      "Ngram : The second parameter pad_width in left_pad vector must be greater than or equal to 0: " +
      std::to_string(left_pad_.second);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  if (right_pad_.second < 0) {
    std::string err_msg =
      "Ngram : The second parameter pad_width in right_pad vector must be greater than or equal to 0: " +
      std::to_string(right_pad_.second);
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  return Status::OK();
}
|
||||
|
||||
std::shared_ptr<TensorOp> NgramOperation::Build() {
|
||||
int32_t l_len = left_pad_.second;
|
||||
int32_t r_len = right_pad_.second;
|
||||
std::string l_pad = left_pad_.first;
|
||||
std::string r_pad = right_pad_.first;
|
||||
std::shared_ptr<NgramOp> tensor_op = std::make_shared<NgramOp>(ngrams_, l_len, r_len, l_pad, r_pad, separator_);
|
||||
return tensor_op;
|
||||
}
|
||||
|
||||
// SentencePieceTokenizerOperation
|
||||
SentencePieceTokenizerOperation::SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab,
|
||||
SPieceTokenizerOutType out_type)
|
||||
|
@ -128,6 +265,36 @@ std::shared_ptr<TensorOp> SentencePieceTokenizerOperation::Build() {
|
|||
return tensor_op;
|
||||
}
|
||||
|
||||
// SlidingWindowOperation
|
||||
// Stores the window width and axis unchanged; they are checked later by ValidateParams().
SlidingWindowOperation::SlidingWindowOperation(const int32_t width, const int32_t axis)
    : width_(width),
      axis_(axis) {}
|
||||
|
||||
// Validates the window width: it must be at least 1. The axis is not checked here.
Status SlidingWindowOperation::ValidateParams() {
  if (width_ < 1) {
    std::string msg = "SlidingWindow : The parameter width must be greater than or equal to 1: ";
    msg += std::to_string(width_);
    MS_LOG(ERROR) << msg;
    RETURN_STATUS_SYNTAX_ERROR(msg);
  }
  return Status::OK();
}
|
||||
|
||||
std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
|
||||
std::shared_ptr<SlidingWindowOp> tensor_op = std::make_shared<SlidingWindowOp>(static_cast<uint32_t>(width_), axis_);
|
||||
return tensor_op;
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
// WhitespaceTokenizerOperation
|
||||
// Stores the with_offsets flag for use by Build().
WhitespaceTokenizerOperation::WhitespaceTokenizerOperation(bool with_offsets)
    : with_offsets_(with_offsets) {}
|
||||
|
||||
// Performs no checks: validation always succeeds.
Status WhitespaceTokenizerOperation::ValidateParams() {
  return Status::OK();
}
|
||||
|
||||
std::shared_ptr<TensorOp> WhitespaceTokenizerOperation::Build() {
|
||||
std::shared_ptr<WhitespaceTokenizerOp> tensor_op = std::make_shared<WhitespaceTokenizerOp>(with_offsets_);
|
||||
return tensor_op;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace text
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -50,6 +50,15 @@ enum class ImageFormat { HWC = 0, CHW = 1, HW = 2 };
|
|||
// Supported interpolation modes
enum class InterpolationMode {
  kLinear = 0,
  kNearestNeighbour = 1,
  kCubic = 2,
  kArea = 3
};
|
||||
|
||||
// Segmentation modes accepted by the Jieba tokenizer
enum class JiebaMode {
  kMix = 0,  // mix of the MPSegment and HMMSegment algorithms
  kMp = 1,   // MPSegment algorithm
  kHmm = 2   // Hidden Markov Model Segment algorithm
};
|
||||
|
||||
// Output types selectable for the SentencePiece tokenizer
enum class SPieceTokenizerOutType {
  kString = 0,
  kInt = 1
};
|
||||
|
||||
// Load types selectable for the SentencePiece tokenizer
enum class SPieceTokenizerLoadType {
  kFile = 0,
  kModel = 1
};
|
||||
|
||||
// Convenience helper for 32-bit bitmasks: true only when every bit set in bitMask is also set in bits.
inline bool BitTest(uint32_t bits, uint32_t bitMask) {
  const uint32_t overlap = bits & bitMask;
  return overlap == bitMask;
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
|
||||
|
@ -37,8 +38,29 @@ namespace dataset {
|
|||
namespace text {
|
||||
|
||||
// Text Op classes (in alphabetical order)
|
||||
class JiebaTokenizerOperation;
|
||||
class LookupOperation;
|
||||
class NgramOperation;
|
||||
class SentencePieceTokenizerOperation;
|
||||
class SlidingWindowOperation;
|
||||
#ifndef _WIN32
|
||||
class WhitespaceTokenizerOperation;
|
||||
#endif
|
||||
|
||||
/// \brief Tokenize Chinese string into words based on dictionary.
|
||||
/// \param[in] hmm_path Dictionary file is used by HMMSegment algorithm. The dictionary can be obtained on the
|
||||
/// official website of cppjieba.
|
||||
/// \param[in] mp_path Dictionary file is used by MPSegment algorithm. The dictionary can be obtained on the
|
||||
/// official website of cppjieba.
|
||||
/// \param[in] mode Valid values can be any of [JiebaMode.MP, JiebaMode.HMM, JiebaMode.MIX](default=JiebaMode.MIX).
|
||||
/// - JiebaMode.MP, tokenize with MPSegment algorithm.
|
||||
/// - JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
|
||||
/// - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
|
||||
/// \param[in] with_offsets Whether to output offsets of tokens (default=false).
|
||||
/// \return Shared pointer to the current TensorOperation.
|
||||
std::shared_ptr<JiebaTokenizerOperation> JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path,
|
||||
const JiebaMode &mode = JiebaMode::kMix,
|
||||
bool with_offsets = false);
|
||||
|
||||
/// \brief Lookup operator that looks up a word to an id.
|
||||
/// \param[in] vocab a Vocab object.
|
||||
|
@ -49,6 +71,21 @@ class SentencePieceTokenizerOperation;
|
|||
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
|
||||
const mindspore::dataset::DataType &data_type = DataType("int32"));
|
||||
|
||||
/// \brief TensorOp to generate n-gram from a 1-D string Tensor.
|
||||
/// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
|
||||
/// would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
|
||||
/// an n-gram, an empty string will be returned.
|
||||
/// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will
|
||||
/// be capped at n-1. left_pad=("_",2) would pad left side of the sequence with "__" (default={"", 0}).
|
||||
/// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence. pad_width will
|
||||
/// be capped at n-1. right_pad=("-",2) would pad right side of the sequence with "--" (default={"", 0}).
|
||||
/// \param[in] separator Symbol used to join strings together (default=" ").
|
||||
/// \return Shared pointer to the current TensorOperation.
|
||||
std::shared_ptr<NgramOperation> Ngram(const std::vector<int32_t> &ngrams,
|
||||
const std::pair<std::string, int32_t> &left_pad = {"", 0},
|
||||
const std::pair<std::string, int32_t> &right_pad = {"", 0},
|
||||
const std::string &separator = " ");
|
||||
|
||||
/// \brief Tokenize scalar token or 1-D tokens to tokens by sentencepiece.
|
||||
/// \param[in] vocab a SentencePieceVocab object.
|
||||
/// \param[in] out_type The type of output.
|
||||
|
@ -63,8 +100,41 @@ std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
|
|||
std::shared_ptr<SentencePieceTokenizerOperation> SentencePieceTokenizer(
|
||||
const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);
|
||||
|
||||
/// \brief TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension
|
||||
/// axis is a slice of data starting at the corresponding position, with a specified width.
|
||||
/// \param[in] width The width of the window. It must be an integer and greater than zero.
|
||||
/// \param[in] axis The axis along which the sliding window is computed (default=0), axis support 0 or -1 only
|
||||
/// for now.
|
||||
/// \return Shared pointer to the current TensorOperation.
|
||||
std::shared_ptr<SlidingWindowOperation> SlidingWindow(const int32_t width, const int32_t axis = 0);
|
||||
|
||||
#ifndef _WIN32
|
||||
/// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces
|
||||
/// \param[in] with_offsets Whether to output offsets of tokens (default=false).
|
||||
/// \return Shared pointer to the current TensorOperation.
|
||||
std::shared_ptr<WhitespaceTokenizerOperation> WhitespaceTokenizer(bool with_offsets = false);
|
||||
#endif
|
||||
|
||||
/* ####################################### Derived TensorOperation classes ################################# */
|
||||
|
||||
class JiebaTokenizerOperation : public TensorOperation {
|
||||
public:
|
||||
explicit JiebaTokenizerOperation(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode,
|
||||
bool with_offsets);
|
||||
|
||||
~JiebaTokenizerOperation() = default;
|
||||
|
||||
std::shared_ptr<TensorOp> Build() override;
|
||||
|
||||
Status ValidateParams() override;
|
||||
|
||||
private:
|
||||
std::string hmm_path_;
|
||||
std::string mp_path_;
|
||||
JiebaMode mode_;
|
||||
bool with_offsets_;
|
||||
};
|
||||
|
||||
class LookupOperation : public TensorOperation {
|
||||
public:
|
||||
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
|
||||
|
@ -83,6 +153,24 @@ class LookupOperation : public TensorOperation {
|
|||
DataType data_type_;
|
||||
};
|
||||
|
||||
class NgramOperation : public TensorOperation {
|
||||
public:
|
||||
explicit NgramOperation(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad,
|
||||
const std::pair<std::string, int32_t> &right_pad, const std::string &separator);
|
||||
|
||||
~NgramOperation() = default;
|
||||
|
||||
std::shared_ptr<TensorOp> Build() override;
|
||||
|
||||
Status ValidateParams() override;
|
||||
|
||||
private:
|
||||
std::vector<int32_t> ngrams_;
|
||||
std::pair<std::string, int32_t> left_pad_;
|
||||
std::pair<std::string, int32_t> right_pad_;
|
||||
std::string separator_;
|
||||
};
|
||||
|
||||
class SentencePieceTokenizerOperation : public TensorOperation {
|
||||
public:
|
||||
SentencePieceTokenizerOperation(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type);
|
||||
|
@ -101,6 +189,37 @@ class SentencePieceTokenizerOperation : public TensorOperation {
|
|||
SPieceTokenizerLoadType load_type_;
|
||||
SPieceTokenizerOutType out_type_;
|
||||
};
|
||||
|
||||
class SlidingWindowOperation : public TensorOperation {
|
||||
public:
|
||||
explicit SlidingWindowOperation(const int32_t width, const int32_t axis);
|
||||
|
||||
~SlidingWindowOperation() = default;
|
||||
|
||||
std::shared_ptr<TensorOp> Build() override;
|
||||
|
||||
Status ValidateParams() override;
|
||||
|
||||
private:
|
||||
int32_t width_;
|
||||
int32_t axis_;
|
||||
};
|
||||
|
||||
#ifndef _WIN32
|
||||
class WhitespaceTokenizerOperation : public TensorOperation {
|
||||
public:
|
||||
explicit WhitespaceTokenizerOperation(bool with_offsets);
|
||||
|
||||
~WhitespaceTokenizerOperation() = default;
|
||||
|
||||
std::shared_ptr<TensorOp> Build() override;
|
||||
|
||||
Status ValidateParams() override;
|
||||
|
||||
private:
|
||||
bool with_offsets_;
|
||||
};
|
||||
#endif
|
||||
} // namespace text
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -20,14 +20,13 @@
|
|||
#include <memory>
|
||||
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
#include "minddata/dataset/core/constants.h"
|
||||
#include "minddata/dataset/kernels/tensor_op.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
// Segmentation modes accepted by the Jieba tokenizer
enum class JiebaMode {
  kMix = 0,  // mix of the MPSegment and HMMSegment algorithms
  kMp = 1,   // MPSegment algorithm
  kHmm = 2   // Hidden Markov Model Segment algorithm
};
|
||||
|
||||
class JiebaTokenizerOp : public TensorOp {
|
||||
public:
|
||||
// default constant for Jieba MPSegment algorithm.
|
||||
|
|
|
@ -23,14 +23,13 @@
|
|||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include "minddata/dataset/core/constants.h"
|
||||
#include "minddata/dataset/kernels/tensor_op.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Output types selectable for the SentencePiece tokenizer
enum class SPieceTokenizerOutType {
  kString = 0,
  kInt = 1
};
|
||||
// Load types selectable for the SentencePiece tokenizer
enum class SPieceTokenizerLoadType {
  kFile = 0,
  kModel = 1
};
|
||||
|
||||
class SentencePieceTokenizerOp : public TensorOp {
|
||||
public:
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <string>
|
||||
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/core/constants.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
#include "minddata/dataset/include/status.h"
|
||||
#include "minddata/dataset/include/transforms.h"
|
||||
|
|
|
@ -0,0 +1,567 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "common/common.h"
|
||||
#include "minddata/dataset/include/datasets.h"
|
||||
#include "minddata/dataset/include/status.h"
|
||||
#include "minddata/dataset/include/transforms.h"
|
||||
#include "minddata/dataset/include/text.h"
|
||||
|
||||
using namespace mindspore::dataset;
|
||||
using mindspore::dataset::DataType;
|
||||
using mindspore::dataset::ShuffleMode;
|
||||
using mindspore::dataset::Status;
|
||||
using mindspore::dataset::Tensor;
|
||||
using mindspore::dataset::Vocab;
|
||||
|
||||
// Test fixture for the text pipeline tests; adds nothing on top of UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {};
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
  // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // ASSERT rather than EXPECT: ds is dereferenced below, so a null pointer must end this test, not crash it.
  ASSERT_NE(ds, nullptr);

  // Create jieba_tokenizer operation on ds
  std::shared_ptr<TensorOperation> jieba_tokenizer = text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(jieba_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({jieba_tokenizer}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // The expected tokens are the same for every row, so the reference tensor is built once.
  std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  std::shared_ptr<Tensor> expected_tensor;
  Tensor::CreateFromVector(expected, &expected_tensor);
  ASSERT_NE(expected_tensor, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Guard the dereference in the comparison below.
    ASSERT_NE(ind, nullptr);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) {
  // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kHmm and the with_offsets is false.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // ASSERT rather than EXPECT: ds is dereferenced below, so a null pointer must end this test, not crash it.
  ASSERT_NE(ds, nullptr);

  // Create jieba_tokenizer operation on ds
  std::shared_ptr<TensorOperation> jieba_tokenizer = text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kHmm);
  EXPECT_NE(jieba_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({jieba_tokenizer}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // The expected tokens are the same for every row, so the reference tensor is built once.
  std::vector<std::string> expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"};
  std::shared_ptr<Tensor> expected_tensor;
  Tensor::CreateFromVector(expected, &expected_tensor);
  ASSERT_NE(expected_tensor, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Guard the dereference in the comparison below.
    ASSERT_NE(ind, nullptr);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
  // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // ASSERT rather than EXPECT: ds is dereferenced below, so a null pointer must end this test, not crash it.
  ASSERT_NE(ds, nullptr);

  // Create jieba_tokenizer operation on ds
  std::shared_ptr<TensorOperation> jieba_tokenizer = text::JiebaTokenizer(hmm_path, mp_path, JiebaMode::kMp, true);
  EXPECT_NE(jieba_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({jieba_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // The expected values are the same for every row, so the reference tensors are built once.
  std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
  std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};

  std::shared_ptr<Tensor> expected_tensor;
  std::shared_ptr<Tensor> expected_tensor_offsets_start;
  std::shared_ptr<Tensor> expected_tensor_offsets_limit;
  Tensor::CreateFromVector(expected, &expected_tensor);
  Tensor::CreateFromVector(expected_offsets_start, &expected_tensor_offsets_start);
  Tensor::CreateFromVector(expected_offsets_limit, &expected_tensor_offsets_limit);
  ASSERT_NE(expected_tensor, nullptr);
  ASSERT_NE(expected_tensor_offsets_start, nullptr);
  ASSERT_NE(expected_tensor_offsets_limit, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["offsets_start"];
    auto ind1 = row["offsets_limit"];
    auto token = row["token"];
    // Guard the dereferences in the comparisons below.
    ASSERT_NE(ind, nullptr);
    ASSERT_NE(ind1, nullptr);
    ASSERT_NE(token, nullptr);
    EXPECT_EQ(*ind, *expected_tensor_offsets_start);
    EXPECT_EQ(*ind1, *expected_tensor_offsets_limit);
    EXPECT_EQ(*token, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail) {
  // Each invalid argument combination below is expected to make JiebaTokenizer return nullptr.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail.";

  // Build the input dataset and the two valid dictionary paths
  std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Case 1: hmm_path is empty
  std::shared_ptr<TensorOperation> op_empty_hmm = text::JiebaTokenizer("", mp_path, JiebaMode::kMp);
  EXPECT_EQ(op_empty_hmm, nullptr);

  // Case 2: mp_path is empty
  std::shared_ptr<TensorOperation> op_empty_mp = text::JiebaTokenizer(hmm_path, "", JiebaMode::kMp);
  EXPECT_EQ(op_empty_mp, nullptr);

  // Case 3: hmm_path is an invalid path
  std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  std::shared_ptr<TensorOperation> op_bad_hmm = text::JiebaTokenizer(hmm_path_invalid, mp_path, JiebaMode::kMp);
  EXPECT_EQ(op_bad_hmm, nullptr);

  // Case 4: mp_path is an invalid path
  std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  std::shared_ptr<TensorOperation> op_bad_mp = text::JiebaTokenizer(hmm_path, mp_path_invalid, JiebaMode::kMp);
  EXPECT_EQ(op_bad_mp, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
  // Testing the parameter of SlidingWindow interface when the axis is 0.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // ASSERT rather than EXPECT: ds is dereferenced below, so a null pointer must end this test, not crash it.
  ASSERT_NE(ds, nullptr);

  // Create white_tokenizer operation on ds
  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  EXPECT_NE(white_tokenizer, nullptr);
  // Create sliding_window operation on ds
  std::shared_ptr<TensorOperation> sliding_window = text::SlidingWindow(3, 0);
  EXPECT_NE(sliding_window, nullptr);

  // Create Map operation on ds
  ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // One flattened (x, 3) window matrix per input line.
  std::vector<std::vector<std::string>> expected = {{"This", "is", "a", "is", "a", "text", "a", "text", "file."},
                                                    {"Be", "happy", "every", "happy", "every", "day."},
                                                    {"Good", "luck", "to", "luck", "to", "everyone."}};

  uint64_t i = 0;
  while (row.size() != 0) {
    // More rows than expectations would index past the end of `expected`.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    ASSERT_NE(ind, nullptr);
    std::shared_ptr<Tensor> expected_tensor;
    int x = static_cast<int>(expected[i].size() / 3);
    Tensor::CreateFromVector(expected[i], TensorShape({x, 3}), &expected_tensor);
    ASSERT_NE(expected_tensor, nullptr);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) {
  // Testing the parameter of SlidingWindow interface when the axis is -1.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // ASSERT rather than EXPECT: ds is dereferenced below, so a null pointer must end this test, not crash it.
  ASSERT_NE(ds, nullptr);

  // Create white_tokenizer operation on ds
  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  EXPECT_NE(white_tokenizer, nullptr);
  // Create sliding_window operation on ds
  std::shared_ptr<TensorOperation> sliding_window = text::SlidingWindow(2, -1);
  EXPECT_NE(sliding_window, nullptr);

  // Create Map operation on ds
  ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // One flattened (x, 2) window matrix per input line.
  std::vector<std::vector<std::string>> expected = {{"This", "is", "is", "a", "a", "text", "text", "file."},
                                                    {"Be", "happy", "happy", "every", "every", "day."},
                                                    {"Good", "luck", "luck", "to", "to", "everyone."}};
  uint64_t i = 0;
  while (row.size() != 0) {
    // More rows than expectations would index past the end of `expected`.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    ASSERT_NE(ind, nullptr);
    std::shared_ptr<Tensor> expected_tensor;
    int x = static_cast<int>(expected[i].size() / 2);
    Tensor::CreateFromVector(expected[i], TensorShape({x, 2}), &expected_tensor);
    ASSERT_NE(expected_tensor, nullptr);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestSlidingWindowFail) {
  // Checks that text::SlidingWindow yields nullptr when it is given an invalid width.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail.";

  // Build a TextFile dataset from the shared test corpus
  const std::string text_path = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({text_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Case 1: width equal to 0. The axis argument is 0, one of the two values (0 or -1)
  // the original test comments describe as supported for now.
  std::shared_ptr<TensorOperation> zero_width_op = text::SlidingWindow(0, 0);
  EXPECT_EQ(zero_width_op, nullptr);

  // Case 2: negative width, again with axis 0.
  std::shared_ptr<TensorOperation> negative_width_op = text::SlidingWindow(-2, 0);
  EXPECT_EQ(negative_width_op, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestNgramSuccess) {
  // Testing the parameter of Ngram interface.
  // The pipeline is TextFile -> WhitespaceTokenizer -> Ngram(n = 2, pads {"_", 1}, separator " "), and
  // every produced row is compared against a hand-written 1-D tensor of bigrams.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // Fatal assertion: ds is dereferenced below, so a null dataset must abort the test instead of crashing it.
  ASSERT_NE(ds, nullptr);

  // Create white_tokenizer operation on ds
  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  ASSERT_NE(white_tokenizer, nullptr);
  // Create ngram operation on ds
  std::shared_ptr<TensorOperation> ngram_op = text::Ngram({2}, {"_", 1}, {"_", 1}, " ");
  ASSERT_NE(ngram_op, nullptr);

  // Create Map operation on ds
  ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<std::string>> expected = {{"_ This", "This is", "is a", "a text", "text file.", "file. _"},
                                                    {"_ Be", "Be happy", "happy every", "every day.", "day. _"},
                                                    {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. _"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index: an unexpected extra row must be reported as a failure rather than read past
    // the end of `expected`. A non-fatal failure plus break keeps the iter->Stop() call below reachable.
    if (i >= expected.size()) {
      ADD_FAILURE() << "Pipeline produced more rows than the " << expected.size() << " expected.";
      break;
    }
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  // Every expected row must have been produced.
  EXPECT_EQ(i, expected.size());

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
  // Testing the Ngram interface with two n values ({2, 3}), pad width 2 on both sides and "-" as separator.
  // The expected rows below list all 2-grams first, followed by all 3-grams.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // Fatal assertion: ds is dereferenced below, so a null dataset must abort the test instead of crashing it.
  ASSERT_NE(ds, nullptr);

  // Create white_tokenizer operation on ds
  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  ASSERT_NE(white_tokenizer, nullptr);
  // Create ngram operation on ds
  std::shared_ptr<TensorOperation> ngram_op = text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-");
  ASSERT_NE(ngram_op, nullptr);

  // Create Map operation on ds
  ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<std::string>> expected = {
    {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text",
     "a-text-file.", "text-file.-&", "file.-&-&"},
    {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
     "happy-every-day.", "every-day.-&", "day.-&-&"},
    {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to",
     "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index: an unexpected extra row must be reported as a failure rather than read past
    // the end of `expected`. A non-fatal failure plus break keeps the iter->Stop() call below reachable.
    if (i >= expected.size()) {
      ADD_FAILURE() << "Pipeline produced more rows than the " << expected.size() << " expected.";
      break;
    }
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  // Every expected row must have been produced.
  EXPECT_EQ(i, expected.size());

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestNgramFail) {
  // Checks that text::Ngram yields nullptr for each kind of invalid argument exercised below.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail.";

  // Build a TextFile dataset from the shared test corpus
  const std::string text_path = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({text_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Case 1: the ngrams vector is empty
  std::shared_ptr<TensorOperation> empty_ngrams_op = text::Ngram({});
  EXPECT_EQ(empty_ngrams_op, nullptr);

  // Case 2: an ngrams value equal to 0
  std::shared_ptr<TensorOperation> zero_ngram_op = text::Ngram({0});
  EXPECT_EQ(zero_ngram_op, nullptr);

  // Case 3: a negative ngrams value
  std::shared_ptr<TensorOperation> negative_ngram_op = text::Ngram({-2});
  EXPECT_EQ(negative_ngram_op, nullptr);

  // Case 4: negative pad_width (second element) in left_pad
  std::shared_ptr<TensorOperation> bad_left_pad_op = text::Ngram({2}, {"", -1});
  EXPECT_EQ(bad_left_pad_op, nullptr);

  // Case 5: negative pad_width (second element) in right_pad
  std::shared_ptr<TensorOperation> bad_right_pad_op = text::Ngram({2}, {"", 1}, {"", -1});
  EXPECT_EQ(bad_right_pad_op, nullptr);
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
  // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default.
  // The pipeline is TextFile -> WhitespaceTokenizer, and every produced row is compared against
  // a hand-written 1-D tensor of tokens.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // Fatal assertion: ds is dereferenced below, so a null dataset must abort the test instead of crashing it.
  ASSERT_NE(ds, nullptr);

  // Create white_tokenizer operation on ds
  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer();
  ASSERT_NE(white_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({white_tokenizer}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  std::vector<std::vector<std::string>> expected = {
    {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index: an unexpected extra row must be reported as a failure rather than read past
    // the end of `expected`. A non-fatal failure plus break keeps the iter->Stop() call below reachable.
    if (i >= expected.size()) {
      ADD_FAILURE() << "Pipeline produced more rows than the " << expected.size() << " expected.";
      break;
    }
    auto ind = row["text"];
    std::shared_ptr<Tensor> expected_tensor;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    EXPECT_EQ(*ind, *expected_tensor);
    iter->GetNextRow(&row);
    i++;
  }

  // Every expected row must have been produced.
  EXPECT_EQ(i, expected.size());

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
  // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true.
  // With offsets enabled the single "text" column is mapped to three output columns
  // ("token", "offsets_start", "offsets_limit"), each of which is checked per row.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // Fatal assertion: ds is dereferenced below, so a null dataset must abort the test instead of crashing it.
  ASSERT_NE(ds, nullptr);

  // Create white_tokenizer operation on ds
  std::shared_ptr<TensorOperation> white_tokenizer = text::WhitespaceTokenizer(true);
  ASSERT_NE(white_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({white_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // The three tables below are parallel: entry i of each describes row i of the dataset.
  // NOTE(review): the offsets look like UTF-8 byte positions; confirm the token literals match the
  // bytes in the data file (e.g. the limit of 18 expected for the second line).
  std::vector<std::vector<std::string>> expected = {
    {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}};

  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index: an unexpected extra row must be reported as a failure rather than read past
    // the end of the expected tables. A non-fatal failure plus break keeps the iter->Stop() call
    // below reachable.
    if (i >= expected.size() || i >= expected_offsets_start.size() || i >= expected_offsets_limit.size()) {
      ADD_FAILURE() << "Pipeline produced more rows than the " << expected.size() << " expected.";
      break;
    }
    auto offsets_start = row["offsets_start"];
    auto offsets_limit = row["offsets_limit"];
    auto token = row["token"];
    std::shared_ptr<Tensor> expected_tensor;
    std::shared_ptr<Tensor> expected_tensor_offsets_start;
    std::shared_ptr<Tensor> expected_tensor_offsets_limit;
    int x = expected[i].size();
    Tensor::CreateFromVector(expected[i], TensorShape({x}), &expected_tensor);
    Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &expected_tensor_offsets_start);
    Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &expected_tensor_offsets_limit);
    EXPECT_EQ(*offsets_start, *expected_tensor_offsets_start);
    EXPECT_EQ(*offsets_limit, *expected_tensor_offsets_limit);
    EXPECT_EQ(*token, *expected_tensor);

    iter->GetNextRow(&row);
    i++;
  }

  // Every expected row must have been produced.
  EXPECT_EQ(i, expected.size());

  // Manually terminate the pipeline
  iter->Stop();
}
|
Loading…
Reference in New Issue