dataset: C++ API ToNumber & Lookup: change data_type parm from string to DataType

This commit is contained in:
Cathy Wong 2021-04-16 14:57:27 -04:00
parent ade65bac93
commit 636809c255
8 changed files with 197 additions and 45 deletions

View File

@ -19,7 +19,8 @@
#include <regex>
#include "minddata/dataset/include/text.h"
#include "mindspore/core/ir/dtype/type_id.h"
#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
namespace mindspore {
@ -203,16 +204,20 @@ Status JiebaTokenizer::ParserFile(const std::string &file_path,
// Lookup
struct Lookup::Data {
Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
const std::vector<char> &data_type)
: vocab_(vocab), unknown_token_(OptionalCharToString(unknown_token)), data_type_(CharToString(data_type)) {}
mindspore::DataType data_type)
: vocab_(vocab),
unknown_token_(OptionalCharToString(unknown_token)),
data_type_(dataset::MSTypeToDEType(static_cast<TypeId>(data_type))) {}
std::shared_ptr<Vocab> vocab_;
std::optional<std::string> unknown_token_;
std::string data_type_;
dataset::DataType data_type_;
};
Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
const std::vector<char> &data_type)
: data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {}
mindspore::DataType data_type)
: data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {
data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
}
std::shared_ptr<TensorOperation> Lookup::Parse() {
return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
@ -331,11 +336,12 @@ std::shared_ptr<TensorOperation> SlidingWindow::Parse() {
// ToNumber
struct ToNumber::Data {
explicit Data(const std::vector<char> &data_type) : data_type_(CharToString(data_type)) {}
std::string data_type_;
dataset::DataType data_type_;
};
ToNumber::ToNumber(const std::vector<char> &data_type) : data_(std::make_shared<Data>(data_type)) {}
// Build a ToNumber transform from a public mindspore::DataType.
// The public enum value is cast to TypeId and translated by dataset::MSTypeToDEType;
// the resulting dataset::DataType is stored in the shared Data block for Parse() to use.
ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>()) {
  const auto ms_type_id = static_cast<TypeId>(data_type);
  data_->data_type_ = dataset::MSTypeToDEType(ms_type_id);
}
std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }

View File

@ -207,13 +207,14 @@ class Lookup final : public TensorTransform {
/// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
/// If unknown_token is oov, a runtime error will be thrown. If unknown_token is {}, no unknown_token is
/// specified for words that are out of Vocabulary (default={}).
/// \param[in] data_type type of the tensor after lookup, typically int32.
/// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool.
/// (default=mindspore::DataType::kNumberTypeInt32).
explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {},
const std::string &data_type = "int32")
: Lookup(vocab, OptionalStringToChar(unknown_token), StringToChar(data_type)) {}
mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32)
: Lookup(vocab, OptionalStringToChar(unknown_token), data_type) {}
explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
const std::vector<char> &data_type);
mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32);
/// \brief Destructor
~Lookup() = default;
@ -405,10 +406,8 @@ class SlidingWindow final : public TensorTransform {
class ToNumber final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] data_type of the tensor to be cast to. Must be a numeric type.
explicit ToNumber(const std::string &data_type) : ToNumber(StringToChar(data_type)) {}
explicit ToNumber(const std::vector<char> &data_type);
/// \param[in] data_type mindspore::DataType of the tensor to be cast to. Must be a numeric type, excluding bool.
explicit ToNumber(mindspore::DataType data_type);
/// \brief Destructor
~ToNumber() = default;

View File

@ -42,6 +42,7 @@
#include "minddata/dataset/text/kernels/whitespace_tokenizer_op.h"
#endif
#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/util/path.h"
#include "minddata/dataset/text/ir/validators.h"
@ -166,9 +167,19 @@ Status JiebaTokenizerOperation::AddWord(const std::string &word, int64_t freq) {
}
// LookupOperation
// Constructor taking an already-resolved DataType; this overload is required by the C++ API.
// \param[in] vocab Vocabulary object stored for the lookup.
// \param[in] unknown_token Optional token stored for out-of-vocabulary handling.
// \param[in] data_type Output type stored as-is; default_id_ starts at Vocab::kNoTokenExists.
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
                                 DataType data_type)
    : vocab_(vocab),
      unknown_token_(unknown_token),
      default_id_(Vocab::kNoTokenExists),
      data_type_(data_type) {}
// std::string data_type - required for Pybind
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
const std::string &data_type)
: vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
: vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists) {
// Convert from string to DEType
DataType temp_data_type(data_type);
data_type_ = temp_data_type;
}
LookupOperation::~LookupOperation() = default;
@ -187,8 +198,9 @@ Status LookupOperation::ValidateParams() {
}
}
if (!IsTypeNumeric(data_type_)) {
std::string err_msg = "Lookup does not support a string to string mapping, data_type can only be numeric.";
if (!data_type_.IsNumeric()) {
// Note: For DEType, Bool is counted as numeric, and is a valid type for Lookup
std::string err_msg = "Lookup : The parameter data_type must be numeric including bool.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
@ -351,11 +363,20 @@ std::shared_ptr<TensorOp> SlidingWindowOperation::Build() {
}
// ToNumberOperation
ToNumberOperation::ToNumberOperation(std::string data_type) : data_type_(data_type) {}
// Constructor taking an already-resolved DataType; this overload is required by the C++ API.
// The given type is stored unchanged in data_type_.
ToNumberOperation::ToNumberOperation(DataType data_type)
    : data_type_(data_type) {}
// Constructor taking the type as a string; this overload is required by Pybind.
// The string is converted to a DataType (DEType) before being stored in data_type_.
ToNumberOperation::ToNumberOperation(std::string data_type) {
  data_type_ = DataType(data_type);
}
Status ToNumberOperation::ValidateParams() {
if (!IsTypeNumeric(data_type_) || IsTypeBoolean(data_type_)) {
std::string err_msg = "ToNumber : The parameter data_type must be a numeric type, got: " + data_type_;
if (!data_type_.IsNumeric() || data_type_.IsBool()) {
// Note: For DEType, Bool is counted as numeric, but is not a valid type for ToNumber.
std::string err_msg = "ToNumber : The parameter data_type must be numeric and excludes bool.";
MS_LOG(ERROR) << err_msg;
RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
@ -368,6 +389,13 @@ std::shared_ptr<TensorOp> ToNumberOperation::Build() {
return tensor_op;
}
// Serialize this operation's arguments to JSON.
// \param[out] out_json Overwritten with an object holding a single "data_type" entry,
//     whose value is the string form of data_type_.
// \return Status::OK() in all cases.
Status ToNumberOperation::to_json(nlohmann::json *out_json) {
  const auto type_name = data_type_.ToString();
  nlohmann::json op_args;
  op_args["data_type"] = type_name;
  *out_json = op_args;
  return Status::OK();
}
// TruncateSequencePairOperation
TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}

View File

@ -142,7 +142,9 @@ class JiebaTokenizerOperation : public TensorOperation {
class LookupOperation : public TensorOperation {
public:
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
const std::string &data_type);
DataType data_type); // Used for C++ API
explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token,
const std::string &data_type); // Used for Pybind
~LookupOperation();
@ -156,7 +158,7 @@ class LookupOperation : public TensorOperation {
std::shared_ptr<Vocab> vocab_;
std::optional<std::string> unknown_token_;
int32_t default_id_;
std::string data_type_;
DataType data_type_;
};
class NgramOperation : public TensorOperation {
@ -273,7 +275,8 @@ class SlidingWindowOperation : public TensorOperation {
class ToNumberOperation : public TensorOperation {
public:
explicit ToNumberOperation(std::string data_type);
explicit ToNumberOperation(DataType data_type); // Used for C++ API
explicit ToNumberOperation(std::string data_type); // Used for Pybind
~ToNumberOperation() = default;
@ -283,8 +286,10 @@ class ToNumberOperation : public TensorOperation {
std::string Name() const override { return kToNumberOperation; }
Status to_json(nlohmann::json *out_json) override;
private:
std::string data_type_;
DataType data_type_;
};
class TruncateSequencePairOperation : public TensorOperation {

View File

@ -89,6 +89,9 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
@ -149,6 +152,9 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
}
EXPECT_EQ(i, 1);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) {

View File

@ -1541,7 +1541,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("int64");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -1596,7 +1596,7 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("float64");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -1651,7 +1651,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail1) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("int8");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -1701,7 +1701,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail2) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("float16");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -1747,7 +1747,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail3) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("int64");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -1789,7 +1789,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail4) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("string");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -1812,7 +1812,7 @@ TEST_F(MindDataTestPipeline, TestToNumberFail5) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>("bool");
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds

View File

@ -36,10 +36,10 @@ class MindDataTestPipeline : public UT::DatasetOpTesting {
};
// Macro to compare 2 MSTensors as not equal; compare datasize only
#define EXPECT_MSTENSOR_DATA_NE(_mstensor1, _mstensor2) \
do { \
EXPECT_NE(_mstensor1.DataSize(), _mstensor2.DataSize()); \
} while (false)
#define EXPECT_MSTENSOR_DATA_NE(_mstensor1, _mstensor2) \
do { \
EXPECT_NE(_mstensor1.DataSize(), _mstensor2.DataSize()); \
} while (false)
TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOp.";
@ -56,7 +56,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
EXPECT_EQ(s, Status::OK());
// Create Lookup operation on ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "<unk>", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -87,6 +88,11 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 6);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
@ -104,7 +110,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
EXPECT_EQ(s, Status::OK());
// Create Lookup operation on ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -135,6 +142,60 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpEmptyString) {
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 6);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestVocabLookupBool) {
  // Positive test: Lookup accepts Bool as its output data_type.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupBool.";

  // Source dataset: TextFile over the vocab words file, shuffling disabled
  const std::string words_path = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Vocabulary built from an in-memory word list plus the special tokens <pad> and <unk>
  const std::vector<std::string> word_list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_rc = Vocab::BuildFromVector(word_list, {"<pad>", "<unk>"}, true, &vocab);
  EXPECT_EQ(build_rc, Status::OK());

  // Lookup transform requesting a Bool result type
  std::shared_ptr<TensorTransform> lookup =
    std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeBool);
  EXPECT_NE(lookup, nullptr);

  // Apply the transform to the "text" column
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Creating the iterator triggers the creation of the Execution Tree and launches it
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Walk every row, logging the looked-up tensor; an empty row ends the loop
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t row_count = 0;
  for (; !row.empty(); row_count++) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // The pipeline is expected to yield exactly 6 rows
  EXPECT_EQ(row_count, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
@ -151,7 +212,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
EXPECT_EQ(s, Status::OK());
// Create lookup op for ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "<unk>", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr);
// Create a Map operation on ds
@ -174,7 +236,8 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
std::shared_ptr<Vocab> vocab;
// Create lookup op
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr);
// Create a Map operation on ds
@ -186,6 +249,33 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
EXPECT_EQ(iter, nullptr);
}
TEST_F(MindDataTestPipeline, TestVocabLookupOpFail3DataType) {
  // Negative test: String is not a valid Lookup data_type, so iterator creation must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail3DataType.";

  // Source dataset: TextFile over the vocab words file, shuffling disabled
  const std::string words_path = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Vocabulary built from an in-memory word list plus the special tokens <pad> and <unk>
  const std::vector<std::string> word_list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_rc = Vocab::BuildFromVector(word_list, {"<pad>", "<unk>"}, true, &vocab);
  EXPECT_EQ(build_rc, Status::OK());

  // Lookup transform constructed with the unsupported String data_type
  std::shared_ptr<TensorTransform> lookup =
    std::make_shared<text::Lookup>(vocab, "", mindspore::DataType::kObjectTypeString);
  EXPECT_NE(lookup, nullptr);

  // Map still returns a dataset node; the failure is only checked at iterator creation below
  ds = ds->Map({lookup});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid Lookup input (String is not valid for data_type)
  EXPECT_EQ(iter, nullptr);
}
TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset.";
@ -204,7 +294,8 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
EXPECT_EQ(home_index, 4);
// Create Lookup operation on ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "<unk>", "int32");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "<unk>", mindspore::DataType::kNumberTypeInt32);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -235,6 +326,11 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 6);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
@ -254,6 +350,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
EXPECT_EQ(home_index, 2);
// Create Lookup operation on ds
// Use default data_type parameter
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "home");
EXPECT_NE(lookup, nullptr);
@ -293,6 +390,11 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 6);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) {
@ -371,7 +473,8 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
EXPECT_EQ(home_index, 2);
// Create Lookup operation on ds
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::Lookup>(vocab, "home", "int64");
std::shared_ptr<TensorTransform> lookup =
std::make_shared<text::Lookup>(vocab, "home", mindspore::DataType::kNumberTypeInt64);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
@ -410,4 +513,9 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 6);
// Manually terminate the pipeline
iter->Stop();
}

View File

@ -202,7 +202,7 @@ def test_lookup_cast_type():
assert test_config("unk") == np.dtype("int32")
# test exception, data_type isn't the correct type
assert "tldr is not of type [<class 'mindspore._c_expression.typing.Type'>]" in test_config("unk", "tldr")
assert "Lookup does not support a string to string mapping, data_type can only be numeric." in \
assert "Lookup : The parameter data_type must be numeric including bool." in \
test_config("w1", mstype.string)