forked from mindspore-Ecosystem/mindspore
dataset: C++ API ToNumber & Lookup: change data_type parameter from string to DataType
This commit is contained in:
parent
ade65bac93
commit
636809c255
|
@ -19,7 +19,8 @@
|
|||
#include <regex>
|
||||
|
||||
#include "minddata/dataset/include/text.h"
|
||||
|
||||
#include "mindspore/core/ir/dtype/type_id.h"
|
||||
#include "minddata/dataset/core/type_id.h"
|
||||
#include "minddata/dataset/text/ir/kernels/text_ir.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
@ -203,16 +204,20 @@ Status JiebaTokenizer::ParserFile(const std::string &file_path,
|
|||
// Lookup
|
||||
struct Lookup::Data {
|
||||
Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
|
||||
const std::vector<char> &data_type)
|
||||
: vocab_(vocab), unknown_token_(OptionalCharToString(unknown_token)), data_type_(CharToString(data_type)) {}
|
||||
mindspore::DataType data_type)
|
||||
: vocab_(vocab),
|
||||
unknown_token_(OptionalCharToString(unknown_token)),
|
||||
data_type_(dataset::MSTypeToDEType(static_cast<TypeId>(data_type))) {}
|
||||
std::shared_ptr<Vocab> vocab_;
|
||||
std::optional<std::string> unknown_token_;
|
||||
std::string data_type_;
|
||||
dataset::DataType data_type_;
|
||||
};
|
||||
|
||||
Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
|
||||
const std::vector<char> &data_type)
|
||||
: data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {}
|
||||
mindspore::DataType data_type)
|
||||
: data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {
|
||||
data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
|
||||
}
|
||||
|
||||
std::shared_ptr<TensorOperation> Lookup::Parse() {
|
||||
return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
|
||||
|
@ -331,11 +336,12 @@ std::shared_ptr<TensorOperation> SlidingWindow::Parse() {
|
|||
|
||||
// ToNumber
|
||||
struct ToNumber::Data {
|
||||
explicit Data(const std::vector<char> &data_type) : data_type_(CharToString(data_type)) {}
|
||||
std::string data_type_;
|
||||
dataset::DataType data_type_;
|
||||
};
|
||||
|
||||
ToNumber::ToNumber(const std::vector<char> &data_type) : data_(std::make_shared<Data>(data_type)) {}
|
||||
// Constructor taking the target type as a mindspore::DataType.
// Allocates the private Data holder with its default constructor, then fills in data_type_.
// No validation of data_type is performed in this constructor.
ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>()) {
|
||||
// The public enum is cast to TypeId and passed through dataset::MSTypeToDEType; the result is what gets stored.
data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
|
||||
}
|
||||
|
||||
// Creates the IR node for this transform: a ToNumberOperation constructed from the stored data_type_.
std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
|
||||
|
||||
|
|
|
@ -207,13 +207,14 @@ class Lookup final : public TensorTransform {
|
|||
/// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
|
||||
/// If unknown_token is oov, a runtime error will be thrown. If unknown_token is {}, no unknown_token is
|
||||
/// specified for words that are out of Vocabulary (default={}).
|
||||
/// \param[in] data_type type of the tensor after lookup, typically int32.
|
||||
/// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool.
|
||||
/// (default=mindspore::DataType::kNumberTypeInt32).
|
||||
explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {},
|
||||
const std::string &data_type = "int32")
|
||||
: Lookup(vocab, OptionalStringToChar(unknown_token), StringToChar(data_type)) {}
|
||||
mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32)
|
||||
: Lookup(vocab, OptionalStringToChar(unknown_token), data_type) {}
|
||||
|
||||
explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
|
||||
const std::vector<char> &data_type);
|
||||
mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32);
|
||||
|
||||
/// \brief Destructor
|
||||
~Lookup() = default;
|
||||
|
@ -405,10 +406,8 @@ class SlidingWindow final : public TensorTransform {
|
|||
class ToNumber final : public TensorTransform {
|
||||
public:
|
||||
/// \brief Constructor.
|
||||
/// \param[in] data_type of the tensor to be cast to. Must be a numeric type.
|
||||
explicit ToNumber(const std::string &data_type) : ToNumber(StringToChar(data_type)) {}
|
||||
|
||||
explicit ToNumber(const std::vector<char> &data_type);
|
||||
/// \param[in] data_type mindspore::DataType of the tensor to be cast to. Must be a numeric type, excluding bool.
|
||||
explicit ToNumber(mindspore::DataType data_type);
|
||||
|
||||
/// \brief Destructor
|
||||
~ToNumber() = default;
|
||||
|
|
|
@ -42,6 +42,7 @@
|
|||
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
|
||||
#endif
|
||||
#include "minddata/dataset/core/data_type.h"
|
||||
#include "minddata/dataset/core/type_id.h"
|
||||
#include "minddata/dataset/util/path.h"
|
||||
|
||||
#include "minddata/dataset/text/ir/validators.h"
|
||||
|
@ -166,9 +167,19 @@ Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
|
|||
}
|
||||
|
||||
// LookupOperation
|
||||
// DataType data_type - required for C++ API
// Stores vocab, unknown_token and data_type as given; default_id_ is initialised to Vocab::kNoTokenExists.
// No argument checking is done in this constructor.
|
||||
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
|
||||
DataType data_type)
|
||||
: vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
|
||||
|
||||
// std::string data_type - required for Pybind
|
||||
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
|
||||
const std::string &data_type)
|
||||
: vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
|
||||
: vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists) {
|
||||
// Convert from string to DEType
|
||||
DataType temp_data_type(data_type);
|
||||
data_type_ = temp_data_type;
|
||||
}
|
||||
|
||||
LookupOperation::~LookupOperation() = default;
|
||||
|
||||
|
@ -187,8 +198,9 @@ Status LookupOperation::ValidateParams() {
|
|||
}
|
||||
}
|
||||
|
||||
if (!IsTypeNumeric(data_type_)) {
|
||||
std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
|
||||
if (!data_type_.IsNumeric()) {
|
||||
// Note: For DEType, Bool is counted as numeric, and is a valid type for Lookup
|
||||
std::string err_msg = "Lookup : The parameter data_type must be numeric including bool.";
|
||||
MS_LOG(ERROR) << err_msg;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
|
@ -351,11 +363,20 @@ std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
|
|||
}
|
||||
|
||||
// ToNumberOperation
|
||||
ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}
|
||||
// DataType data_type - required for C++ API
// Stores the given DataType directly in data_type_; nothing is validated in this constructor.
|
||||
ToNumberOperation::ToNumberOperation(DataType data_type) : data_type_(data_type) {}
|
||||
|
||||
// std::string data_type - required for Pybind
// Accepts the type as a string, builds a DataType from it and stores that in data_type_.
// The string is not checked here.
|
||||
ToNumberOperation::ToNumberOperation(std::string data_type) {
|
||||
// Convert from string to DEType
|
||||
DataType temp_data_type(data_type);
|
||||
// data_type_ is assigned in the body rather than in a member initializer list.
data_type_ = temp_data_type;
|
||||
}
|
||||
|
||||
Status ToNumberOperation::ValidateParams() {
|
||||
if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
|
||||
std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_;
|
||||
if (!data_type_.IsNumeric() || data_type_.IsBool()) {
|
||||
// Note: For DEType, Bool is counted as numeric, but is not a valid type for ToNumber.
|
||||
std::string err_msg = "ToNumber : The parameter data_type must be numeric and excludes bool.";
|
||||
MS_LOG(ERROR) << err_msg;
|
||||
RETURN_STATUS_SYNTAX_ERROR(err_msg);
|
||||
}
|
||||
|
@ -368,6 +389,13 @@ std::shared_ptr<TensorOp> ToNumberOperation::Build() {
|
|||
return tensor_op;
|
||||
}
|
||||
|
||||
// Serializes this operation's arguments to JSON.
// \param[out] out_json receives an object with a single "data_type" key holding data_type_.ToString().
//     The pointer is dereferenced without a null check.
// \return Status::OK() unconditionally.
Status ToNumberOperation::to_json(nlohmann::json *out_json) {
|
||||
nlohmann::json args;
|
||||
// The type is written in its string form (data_type_.ToString()).
args["data_type"] = data_type_.ToString();
|
||||
*out_json = args;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// TruncateSequencePairOperation
|
||||
TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}
|
||||
|
||||
|
|
|
@ -142,7 +142,9 @@ class JiebaTokenizerOperation : public TensorOperation {
|
|||
class LookupOperation : public TensorOperation {
|
||||
public:
|
||||
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
|
||||
const std::string &data_type);
|
||||
DataType data_type); // Used for C++ API
|
||||
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
|
||||
const std::string &data_type); // Used for Pybind
|
||||
|
||||
~LookupOperation();
|
||||
|
||||
|
@ -156,7 +158,7 @@ class LookupOperation : public TensorOperation {
|
|||
std::shared_ptr<Vocab> vocab_;
|
||||
std::optional<std::string> unknown_token_;
|
||||
int32_t default_id_;
|
||||
std::string data_type_;
|
||||
DataType data_type_;
|
||||
};
|
||||
|
||||
class NgramOperation : public TensorOperation {
|
||||
|
@ -273,7 +275,8 @@ class SlidingWindowOperation : public TensorOperation {
|
|||
|
||||
class ToNumberOperation : public TensorOperation {
|
||||
public:
|
||||
explicit ToNumberOperation(std::string data_type);
|
||||
explicit ToNumberOperation(DataType data_type); // Used for C++ API
|
||||
explicit ToNumberOperation(std::string data_type); // Used for Pybind
|
||||
|
||||
~ToNumberOperation() = default;
|
||||
|
||||
|
@ -283,8 +286,10 @@ class ToNumberOperation : public TensorOperation {
|
|||
|
||||
std::string Name() const override { return kToNumberOperation; }
|
||||
|
||||
Status to_json(nlohmann::json *out_json) override;
|
||||
|
||||
private:
|
||||
std::string data_type_;
|
||||
DataType data_type_;
|
||||
};
|
||||
|
||||
class TruncateSequencePairOperation : public TensorOperation {
|
||||
|
|
|
@ -89,6 +89,9 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
|
|||
}
|
||||
|
||||
EXPECT_EQ(i, 1);
|
||||
|
||||
// Manually terminate the pipeline
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
|
||||
|
@ -149,6 +152,9 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
|
|||
}
|
||||
|
||||
EXPECT_EQ(i, 1);
|
||||
|
||||
// Manually terminate the pipeline
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) {
|
||||
|
|
|
@ -1541,7 +1541,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
|
|||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Create ToNumber operation on ds
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("int64");
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
|
||||
EXPECT_NE(to_number, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
|
@ -1596,7 +1596,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
|
|||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Create ToNumber operation on ds
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("float64");
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
|
||||
EXPECT_NE(to_number, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
|
@ -1651,7 +1651,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) {
|
|||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Create ToNumber operation on ds
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("int8");
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
|
||||
EXPECT_NE(to_number, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
|
@ -1701,7 +1701,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) {
|
|||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Create ToNumber operation on ds
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("float16");
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
|
||||
EXPECT_NE(to_number, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
|
@ -1747,7 +1747,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail3) {
|
|||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Create ToNumber operation on ds
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("int64");
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
|
||||
EXPECT_NE(to_number, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
|
@ -1789,7 +1789,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail4) {
|
|||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Create ToNumber operation on ds
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("string");
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
|
||||
EXPECT_NE(to_number, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
|
@ -1812,7 +1812,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail5) {
|
|||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Create ToNumber operation on ds
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("bool");
|
||||
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
|
||||
EXPECT_NE(to_number, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
|
|
|
@ -36,10 +36,10 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
|
|||
};
|
||||
|
||||
// Macro to compare 2 MSTensors as not equal; compare datasize only
|
||||
#define EXPECT_MSTENSOR_DATA_NE(_mstensor1, _mstensor2) \
|
||||
do { \
|
||||
EXPECT_NE(_mstensor1.DataSize(), _mstensor2.DataSize()); \
|
||||
} while (false)
|
||||
#define EXPECT_MSTENSOR_DATA_NE(_mstensor1, _mstensor2) \
|
||||
do { \
|
||||
EXPECT_NE(_mstensor1.DataSize(), _mstensor2.DataSize()); \
|
||||
} while (false)
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOp.";
|
||||
|
@ -56,7 +56,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
|
|||
EXPECT_EQ(s, Status::OK());
|
||||
|
||||
// Create Lookup operation on ds
|
||||
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "<unk>", "int32");
|
||||
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeInt32);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create Map operation on ds
|
||||
|
@ -87,6 +88,11 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
|
|||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
i++;
|
||||
}
|
||||
|
||||
EXPECT_EQ(i, 6);
|
||||
|
||||
// Manually terminate the pipeline
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
|
||||
|
@ -104,7 +110,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
|
|||
EXPECT_EQ(s, Status::OK());
|
||||
|
||||
// Create Lookup operation on ds
|
||||
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "", "int32");
|
||||
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::Lookup>(vocab, "", mindspore::DataType::kNumberTypeInt32);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create Map operation on ds
|
||||
|
@ -135,6 +142,60 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
|
|||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
i++;
|
||||
}
|
||||
|
||||
EXPECT_EQ(i, 6);
|
||||
|
||||
// Manually terminate the pipeline
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
// Feature: text::Lookup with data_type = mindspore::DataType::kNumberTypeBool.
// Description: builds a vocab from a fixed word list, maps Lookup over the "text" column of words.txt
//     and iterates the pipeline to the end.
// Expectation: every creation step succeeds and exactly 6 rows are read. Tensor contents are only
//     logged, not asserted.
TEST_F(MindDataTestPipeline, TestVocabLookupBool) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupBool.";
|
||||
// Invoke Lookup with Bool data_type
|
||||
|
||||
// Create a TextFile dataset
|
||||
std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
|
||||
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Create a vocab from vector
|
||||
std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
|
||||
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
|
||||
Status s = Vocab::BuildFromVector(list, {"<pad>", "<unk>"}, true, &vocab);
|
||||
EXPECT_EQ(s, Status::OK());
|
||||
|
||||
// Create Lookup operation on ds
|
||||
// "<unk>" is one of the special tokens passed to BuildFromVector above.
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeBool);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create Map operation on ds
|
||||
ds = ds->Map({lookup}, {"text"});
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Create an iterator over the result of the above dataset
|
||||
// This will trigger the creation of the Execution Tree and launch it.
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
EXPECT_NE(iter, nullptr);
|
||||
|
||||
// Iterate the dataset and get each row
|
||||
std::unordered_map<std::string, mindspore::MSTensor> row;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
|
||||
// i counts the rows consumed; the loop ends when GetNextRow leaves the row map empty.
uint64_t i = 0;
|
||||
while (row.size() != 0) {
|
||||
auto ind = row["text"];
|
||||
MS_LOG(INFO) << ind.Shape();
|
||||
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
|
||||
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
i++;
|
||||
}
|
||||
|
||||
// Only the number of rows is verified.
EXPECT_EQ(i, 6);
|
||||
|
||||
// Manually terminate the pipeline
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
|
||||
|
@ -151,7 +212,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
|
|||
EXPECT_EQ(s, Status::OK());
|
||||
|
||||
// Create lookup op for ds
|
||||
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "<unk>", "int32");
|
||||
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeInt32);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
|
@ -174,7 +236,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
|
|||
std::shared_ptr<Vocab> vocab;
|
||||
|
||||
// Create lookup op
|
||||
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "", "int32");
|
||||
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::Lookup>(vocab, "", mindspore::DataType::kNumberTypeInt32);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
|
@ -186,6 +249,33 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
|
|||
EXPECT_EQ(iter, nullptr);
|
||||
}
|
||||
|
||||
// Feature: text::Lookup parameter validation for data_type.
// Description: constructs Lookup with mindspore::DataType::kObjectTypeString and maps it over words.txt.
// Expectation: the Lookup object and the mapped dataset are created (non-null), but CreateIterator()
//     returns nullptr.
TEST_F(MindDataTestPipeline, TestVocabLookupOpFail3DataType) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail3DataType.";
|
||||
// Create a TextFile Dataset
|
||||
std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
|
||||
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
// Build vocab from vector
|
||||
std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
|
||||
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
|
||||
Status s = Vocab::BuildFromVector(list, {"<pad>", "<unk>"}, true, &vocab);
|
||||
EXPECT_EQ(s, Status::OK());
|
||||
|
||||
// Create lookup op for ds
|
||||
// Constructing the transform is expected to succeed even with the invalid type (checked just below).
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::Lookup>(vocab, "", mindspore::DataType::kObjectTypeString);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create a Map operation on ds
|
||||
// Note: Map is called here without an explicit input column list.
ds = ds->Map({lookup});
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
// Expect failure: invalid Lookup input (String is not valid for data_type)
|
||||
EXPECT_EQ(iter, nullptr);
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset.";
|
||||
|
||||
|
@ -204,7 +294,8 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
|
|||
EXPECT_EQ(home_index, 4);
|
||||
|
||||
// Create Lookup operation on ds
|
||||
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "<unk>", "int32");
|
||||
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeInt32);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create Map operation on ds
|
||||
|
@ -235,6 +326,11 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
|
|||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
i++;
|
||||
}
|
||||
|
||||
EXPECT_EQ(i, 6);
|
||||
|
||||
// Manually terminate the pipeline
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
|
||||
|
@ -254,6 +350,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
|
|||
EXPECT_EQ(home_index, 2);
|
||||
|
||||
// Create Lookup operation on ds
|
||||
// Use default data_type parameter
|
||||
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "home");
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
|
@ -293,6 +390,11 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
|
|||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
i++;
|
||||
}
|
||||
|
||||
EXPECT_EQ(i, 6);
|
||||
|
||||
// Manually terminate the pipeline
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) {
|
||||
|
@ -371,7 +473,8 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
|
|||
EXPECT_EQ(home_index, 2);
|
||||
|
||||
// Create Lookup operation on ds
|
||||
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "home", "int64");
|
||||
std::shared_ptr<TensorTransform> lookup =
|
||||
std::make_shared<text::Lookup>(vocab, "home", mindspore::DataType::kNumberTypeInt64);
|
||||
EXPECT_NE(lookup, nullptr);
|
||||
|
||||
// Create Map operation on ds
|
||||
|
@ -410,4 +513,9 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
|
|||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
i++;
|
||||
}
|
||||
|
||||
EXPECT_EQ(i, 6);
|
||||
|
||||
// Manually terminate the pipeline
|
||||
iter->Stop();
|
||||
}
|
||||
|
|
|
@ -202,7 +202,7 @@ def test_lookup_cast_type():
|
|||
assert test_config("unk") == np.dtype("int32")
|
||||
# test exception, data_type isn't the correct type
|
||||
assert "tldr is not of type [<class 'mindspore._c_expression.typing.Type'>]" in test_config("unk", "tldr")
|
||||
assert "Lookup does not support a string to string mapping, data_type can only be numeric." in \
|
||||
assert "Lookup : The parameter data_type must be numeric including bool." in \
|
||||
test_config("w1", mstype.string)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue