!21690 [assistant][ops] Add data operator Vectors

Merge pull request !21690 from 张渝/Vectors
This commit is contained in:
i-robot 2021-12-01 01:29:45 +00:00 committed by Gitee
commit eb9537af11
28 changed files with 1391 additions and 20 deletions

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -18,13 +18,13 @@
#include "pybind11/stl_bind.h"
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {
PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
(void)py::class_<Vocab, std::shared_ptr<Vocab>>(*m, "Vocab")
.def(py::init<>())
@ -88,5 +88,14 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) {
.export_values();
}));
PYBIND_REGISTER(Vectors, 0, ([](const py::module *m) {
(void)py::class_<Vectors, std::shared_ptr<Vectors>>(*m, "Vectors")
.def(py::init<>())
.def_static("from_file", [](const std::string &path, int32_t max_vectors) {
std::shared_ptr<Vectors> vectors;
THROW_IF_ERROR(Vectors::BuildFromFile(&vectors, path, max_vectors));
return vectors;
});
}));
} // namespace dataset
} // namespace mindspore
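For reference, the static `from_file` binding registered above is the entry point the Python-side factory calls into; a minimal sketch of that path (the file path is a placeholder, not from this commit):

import mindspore.dataset.text as text

# Python-side Vectors.from_file forwards to the pybind "from_file" above;
# max_vectors=None is translated to 0 (no limit) before crossing into C++.
vectors = text.Vectors.from_file("/path/to/vectors/file")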

View File

@ -19,6 +19,7 @@
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
@ -208,6 +209,18 @@ PYBIND_REGISTER(ToNumberOperation, 1, ([](const py::module *m) {
}));
}));
PYBIND_REGISTER(
ToVectorsOperation, 1, ([](const py::module *m) {
(void)py::class_<text::ToVectorsOperation, TensorOperation, std::shared_ptr<text::ToVectorsOperation>>(
*m, "ToVectorsOperation")
.def(py::init(
[](const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup) {
auto to_vectors = std::make_shared<text::ToVectorsOperation>(vectors, unk_init, lower_case_backup);
THROW_IF_ERROR(to_vectors->ValidateParams());
return to_vectors;
}));
}));
PYBIND_REGISTER(TruncateSequencePairOperation, 1, ([](const py::module *m) {
(void)py::class_<text::TruncateSequencePairOperation, TensorOperation,
std::shared_ptr<text::TruncateSequencePairOperation>>(

View File

@ -358,6 +358,22 @@ ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>
std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
// ToVectors
struct ToVectors::Data {
Data(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup)
: vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
std::shared_ptr<Vectors> vectors_;
std::vector<float> unk_init_;
bool lower_case_backup_;
};
ToVectors::ToVectors(const std::shared_ptr<Vectors> &vectors, const std::vector<float> unk_init, bool lower_case_backup)
: data_(std::make_shared<Data>(vectors, unk_init, lower_case_backup)) {}
std::shared_ptr<TensorOperation> ToVectors::Parse() {
return std::make_shared<ToVectorsOperation>(data_->vectors_, data_->unk_init_, data_->lower_case_backup_);
}
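The three fields of the `Data` pimpl above map one-to-one to the arguments accepted on the Python side; a minimal sketch, assuming six-dimensional vectors as in this commit's test data and a placeholder file path:

import mindspore.dataset.text as text

vectors = text.Vectors.from_file("/path/to/vectors/file")
# vectors_, unk_init_ and lower_case_backup_ carry these three arguments.
to_vectors = text.ToVectors(vectors, unk_init=[0.0] * 6, lower_case_backup=True)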
// TruncateSequencePair
struct TruncateSequencePair::Data {
explicit Data(int32_t max_length) : max_length_(max_length) {}

View File

@ -31,13 +31,13 @@
namespace mindspore {
namespace dataset {
class Vocab;
class SentencePieceVocab;
class TensorOperation;
class Vectors;
class Vocab;
// Transform operations for text
namespace text {
#ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
/// \note BasicTokenizer is not supported on the Windows platform yet.
@ -629,6 +629,30 @@ class MS_API ToNumber final : public TensorTransform {
std::shared_ptr<Data> data_;
};
/// \brief Look up a token's vector according to the input Vectors table.
class MS_API ToVectors final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] vectors A Vectors object.
/// \param[in] unk_init If the token is out-of-vectors (OOV), the result will be initialized with `unk_init`
///     (default={}, which means initializing with zero vectors).
/// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false).
explicit ToVectors(const std::shared_ptr<Vectors> &vectors, std::vector<float> unk_init = {},
bool lower_case_backup = false);
/// \brief Destructor.
~ToVectors() = default;
protected:
/// \brief The function to convert a TensorTransform object into a TensorOperation object.
/// \return Shared pointer to the TensorOperation object.
std::shared_ptr<TensorOperation> Parse() override;
private:
struct Data;
std::shared_ptr<Data> data_;
};
/// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
class MS_API TruncateSequencePair final : public TensorTransform {
public:

View File

@ -133,6 +133,7 @@ constexpr char kNormalizeUTF8Op[] = "NormalizeUTF8Op";
constexpr char kRegexReplaceOp[] = "RegexReplaceOp";
constexpr char kRegexTokenizerOp[] = "RegexTokenizerOp";
constexpr char kToNumberOp[] = "ToNumberOp";
constexpr char kToVectorsOp[] = "ToVectorsOp";
constexpr char kTruncateSequencePairOp[] = "TruncateSequencePairOp";
constexpr char kUnicodeCharTokenizerOp[] = "UnicodeCharTokenizerOp";
constexpr char kUnicodeScriptTokenizerOp[] = "UnicodeScriptTokenizerOp";

View File

@ -4,6 +4,7 @@ add_subdirectory(kernels)
file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(text OBJECT
vectors.cc
vocab.cc
sentence_piece_vocab.cc
)

View File

@ -33,6 +33,7 @@
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/kernels/sliding_window_op.h"
#include "minddata/dataset/text/kernels/to_number_op.h"
#include "minddata/dataset/text/kernels/to_vectors_op.h"
#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
@ -420,6 +421,27 @@ Status ToNumberOperation::from_json(nlohmann::json op_params, std::shared_ptr<Te
return Status::OK();
}
// ToVectorsOperation
ToVectorsOperation::ToVectorsOperation(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
bool lower_case_backup)
: vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
ToVectorsOperation::~ToVectorsOperation() = default;
Status ToVectorsOperation::ValidateParams() {
if (vectors_ == nullptr) {
std::string err_msg = "ToVectors: vectors can't be nullptr.";
MS_LOG(ERROR) << err_msg;
LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
return Status::OK();
}
std::shared_ptr<TensorOp> ToVectorsOperation::Build() {
std::shared_ptr<ToVectorsOp> tensor_op = std::make_shared<ToVectorsOp>(vectors_, unk_init_, lower_case_backup_);
return tensor_op;
}
// TruncateSequencePairOperation
TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}

View File

@ -27,6 +27,7 @@
namespace mindspore {
namespace dataset {
class Vectors;
class Vocab;
class SentencePieceVocab;
@ -45,6 +46,7 @@ constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
constexpr char kSlidingWindowOperation[] = "SlidingWindow";
constexpr char kToNumberOperation[] = "ToNumber";
constexpr char kToVectorsOperation[] = "ToVectors";
constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
@ -294,6 +296,25 @@ class ToNumberOperation : public TensorOperation {
DataType data_type_;
};
class ToVectorsOperation : public TensorOperation {
public:
ToVectorsOperation(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
bool lower_case_backup);
~ToVectorsOperation();
std::shared_ptr<TensorOp> Build() override;
Status ValidateParams() override;
std::string Name() const override { return kToVectorsOperation; }
private:
std::shared_ptr<Vectors> vectors_;
std::vector<float> unk_init_;
bool lower_case_backup_;
};
class TruncateSequencePairOperation : public TensorOperation {
public:
explicit TruncateSequencePairOperation(int32_t max_length);

View File

@ -22,6 +22,7 @@ add_library(text-kernels OBJECT
wordpiece_tokenizer_op.cc
truncate_sequence_pair_op.cc
to_number_op.cc
to_vectors_op.cc
sentence_piece_tokenizer_op.cc
${ICU_DEPEND_FILES}
)

View File

@ -0,0 +1,58 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/text/kernels/to_vectors_op.h"
namespace mindspore {
namespace dataset {
ToVectorsOp::ToVectorsOp(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
bool lower_case_backup)
: vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
Status ToVectorsOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
CHECK_FAIL_RETURN_UNEXPECTED(unk_init_.size() == 0 || unk_init_.size() == vectors_->Dim(),
"ToVectors: unk_init must be the same length as vectors, but got unk_init: " +
std::to_string(unk_init_.size()) + " and vectors: " + std::to_string(vectors_->Dim()));
std::vector<float> vectors_vec;
int len = 0;
for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
std::vector<float> vectors_value = vectors_->Lookup(std::string(*itr), unk_init_, lower_case_backup_);
CHECK_FAIL_RETURN_UNEXPECTED(!vectors_value.empty(), "ToVectors: invalid data, token: \"" + std::string(*itr) +
"\" doesn't exist in vectors and no unk_init is specified.");
vectors_vec.insert(vectors_vec.end(), vectors_value.begin(), vectors_value.end());
len++;
}
CHECK_FAIL_RETURN_UNEXPECTED(len > 0, "ToVectors: invalid data, input tensor is empty.");
int dim = static_cast<int>(vectors_vec.size() / len);
if (vectors_vec.size() == dim) {
RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, output));
} else {
RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, TensorShape({len, dim}), output));
}
return Status::OK();
}
Status ToVectorsOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() == NumInput() && outputs.size() == NumOutput(),
"ToVectors: input and output size don't match.");
CHECK_FAIL_RETURN_UNEXPECTED(inputs[0] == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
outputs[0] = DataType(DataType::DE_FLOAT32);
return Status::OK();
}
} // namespace dataset
} // namespace mindspore
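Compute flattens every looked-up vector into `vectors_vec` and reshapes to `{len, dim}` only when more than one token was read, so a scalar string yields a rank-1 tensor and a sequence yields a rank-2 tensor. A sketch of both cases through the eager Python API (placeholder path; six-dimensional vectors assumed):

import numpy as np
import mindspore.dataset.text as text

vectors = text.Vectors.from_file("/path/to/vectors/file")
to_vectors = text.ToVectors(vectors)

scalar_out = to_vectors("ok")                     # rank-1, shape (6,)
batch_out = to_vectors(np.array(["ok", "home"]))  # rank-2, shape (2, 6)
print(scalar_out.shape, batch_out.shape)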

View File

@ -0,0 +1,64 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
class ToVectorsOp : public TensorOp {
public:
/// \brief Constructor.
/// \param[in] vectors Vectors used to lookup tokens.
/// \param[in] unk_init Vector used to initialize OOV token.
/// \param[in] lower_case_backup Whether to look up the token in the lower case.
ToVectorsOp(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup);
/// \brief Destructor.
~ToVectorsOp() = default;
/// \brief Perform actual ToVectors on each tensor.
/// \param[in] input Input tensor.
/// \param[out] output Output tensor.
/// \return Status code.
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
/// \brief Compute the DataType of the output from the input.
/// \param[in] inputs DataType of input tensor.
/// \param[out] outputs DataType of output tensor.
/// \return Status code.
Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
/// \brief Get Op name.
std::string Name() const override { return kToVectorsOp; }
private:
std::shared_ptr<Vectors> vectors_;
std::vector<float> unk_init_;
bool lower_case_backup_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_

View File

@ -0,0 +1,145 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/text/vectors.h"
#include "utils/file_utils.h"
namespace mindspore {
namespace dataset {
Status Vectors::InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines,
int32_t *vector_dim) {
RETURN_UNEXPECTED_IF_NULL(num_lines);
RETURN_UNEXPECTED_IF_NULL(header_num_lines);
RETURN_UNEXPECTED_IF_NULL(vector_dim);
std::ifstream file_reader;
file_reader.open(path, std::ios::in);
CHECK_FAIL_RETURN_UNEXPECTED(file_reader.is_open(), "Vectors: invalid file, failed to open vector file: " + path);
*num_lines = 0, *header_num_lines = 0, *vector_dim = -1;
std::string line, row;
while (std::getline(file_reader, line)) {
if (*vector_dim == -1) {
std::vector<std::string> vec;
std::istringstream line_reader(line);
while (std::getline(line_reader, row, ' ')) {
vec.push_back(row);
}
// The number of rows and dimensions can be obtained directly from the information header.
const int kInfoHeaderSize = 2;
if (vec.size() == kInfoHeaderSize) {
(*header_num_lines)++;
} else {
*vector_dim = vec.size() - 1;
(*num_lines)++;
}
} else {
(*num_lines)++;
}
}
CHECK_FAIL_RETURN_UNEXPECTED(*num_lines > 0, "Vectors: invalid file, file is empty.");
if (max_vectors > 0) {
*num_lines = std::min(max_vectors, *num_lines); // Limit the number of rows actually read.
}
return Status::OK();
}
Status Vectors::Load(const std::string &path, int32_t max_vectors,
std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim) {
RETURN_UNEXPECTED_IF_NULL(map);
RETURN_UNEXPECTED_IF_NULL(vector_dim);
auto realpath = FileUtils::GetRealPath(common::SafeCStr(path));
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Vectors: get real path failed, path: " + path);
auto file_path = realpath.value();
CHECK_FAIL_RETURN_UNEXPECTED(max_vectors >= 0,
"Vectors: max_vectors must be non-negative, but got: " + std::to_string(max_vectors));
int num_lines = 0, header_num_lines = 0;
RETURN_IF_NOT_OK(InferShape(file_path, max_vectors, &num_lines, &header_num_lines, vector_dim));
std::fstream file_reader;
file_reader.open(file_path, std::ios::in);
CHECK_FAIL_RETURN_UNEXPECTED(file_reader.is_open(),
"Vectors: invalid file, failed to open vector file: " + file_path);
while (header_num_lines > 0) {
file_reader.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
header_num_lines--;
}
std::string line, token, vector_value;
for (auto i = 0; i < num_lines; ++i) {
std::getline(file_reader, line);
std::istringstream line_reader(line);
std::getline(line_reader, token, ' ');
std::vector<float> vector_values;
int dim = 0;
while (line_reader >> vector_value) {
dim++;
vector_values.push_back(atof(vector_value.c_str()));
}
CHECK_FAIL_RETURN_UNEXPECTED(dim > 1, "Vectors: invalid file, the dimension of each vector must be greater than 1.");
CHECK_FAIL_RETURN_UNEXPECTED(dim == *vector_dim,
"Vectors: all vectors must have the same number of dimensions, but got dim " +
std::to_string(dim) + " while expecting " + std::to_string(*vector_dim));
auto token_index = map->find(token);
if (token_index == map->end()) {
(*map)[token] = vector_values;
}
}
return Status::OK();
}
Vectors::Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : map_(map), dim_(dim) {}
Status Vectors::BuildFromFile(std::shared_ptr<Vectors> *vectors, const std::string &path, int32_t max_vectors) {
std::unordered_map<std::string, std::vector<float>> map;
int vector_dim = -1;
RETURN_IF_NOT_OK(Load(path, max_vectors, &map, &vector_dim));
*vectors = std::make_shared<Vectors>(std::move(map), vector_dim);
return Status::OK();
}
std::vector<float> Vectors::Lookup(const std::string &token, const std::vector<float> &unk_init,
bool lower_case_backup) {
std::vector<float> init_vec(dim_, 0);
if (!unk_init.empty()) {
if (unk_init.size() != dim_) {
MS_LOG(WARNING) << "Vectors: size of unk_init is not the same as vectors, will initialize with zero vectors.";
} else {
init_vec = unk_init;
}
}
std::string lower_token = token;
if (lower_case_backup) {
transform(lower_token.begin(), lower_token.end(), lower_token.begin(), ::tolower);
}
auto str_index = map_.find(lower_token);
if (str_index == map_.end()) {
return init_vec;
} else {
return str_index->second;
}
}
} // namespace dataset
} // namespace mindspore
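InferShape treats a first line with exactly two fields as an information header ("<number of vectors> <dimension>", as in the word2vec text format), and Load skips it; every other line must be a token followed by `vector_dim` floats. A sketch that writes and loads such a file (file name and values are illustrative):

import mindspore.dataset.text as text

# The two-field first line "2 3" is detected as the info header
# (kInfoHeaderSize == 2) and skipped; the remaining lines define
# two 3-dimensional vectors.
with open("vectors_with_info.txt", "w") as f:
    f.write("2 3\n")
    f.write("ok 0.1 0.2 0.3\n")
    f.write("home 0.4 0.5 0.6\n")

vectors = text.Vectors.from_file("vectors_with_info.txt")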

View File

@ -0,0 +1,89 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
#include <algorithm>
#include <fstream>
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/include/dataset/iterator.h"
namespace mindspore {
namespace dataset {
/// \brief Pre-trained word vectors.
class Vectors {
public:
/// Constructor.
Vectors() = default;
/// Constructor.
/// \param[in] map A map between string and vector.
/// \param[in] dim Dimension of the vectors.
Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim);
/// Destructor.
virtual ~Vectors() = default;
/// \brief Build a Vectors object by reading a pre-trained vector file.
/// \param[out] vectors Vectors object which contains the pre-trained vectors.
/// \param[in] path Path to the pre-trained word vector file.
/// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
static Status BuildFromFile(std::shared_ptr<Vectors> *vectors, const std::string &path, int32_t max_vectors = 0);
/// \brief Look up the embedding vector of a token.
/// \param[in] token A token to be looked up.
/// \param[in] unk_init If the token is out-of-vectors (OOV), the result will be initialized with `unk_init`
///     (default={}, which means initializing with zero vectors).
/// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false).
/// \return The vector of the input token.
virtual std::vector<float> Lookup(const std::string &token, const std::vector<float> &unk_init = {},
bool lower_case_backup = false);
/// \brief Getter of dimension.
const int &Dim() const { return dim_; }
protected:
/// \brief Infer the shape of the pre-trained word vector file.
/// \param[in] path Path to the pre-trained word vector file.
/// \param[in] max_vectors Maximum number of pre-trained word vectors to be read.
/// \param[out] num_lines The number of lines of the file.
/// \param[out] header_num_lines The number of lines of file header.
/// \param[out] vector_dim The dimension of the vectors in the file.
static Status InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines,
int32_t *vector_dim);
/// \brief Load the map by reading a pre-trained vector file.
/// \param[in] path Path to the pre-trained word vector file.
/// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded; must be non-negative.
/// \param[out] map The map between words and vectors.
/// \param[out] vector_dim The dimension of the vectors in the file.
static Status Load(const std::string &path, int32_t max_vectors,
std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim);
int dim_;
std::unordered_map<std::string, std::vector<float>> map_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_

View File

@ -26,15 +26,15 @@ Common imported modules in corresponding API examples are as follows:
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer
TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
SPieceTokenizerOutType, SPieceTokenizerLoadType
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors
__all__ = [
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
"to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
"PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType",
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm",
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors"
]
if platform.system().lower() != 'windows':
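With the exports above in place, both new symbols resolve from the package root:

# Both names are newly exported by this commit.
from mindspore.dataset.text import Vectors, ToVectors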

View File

@ -48,7 +48,7 @@ import mindspore._c_dataengine as cde
from mindspore.common import dtype as mstype
from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
from .validators import check_lookup, check_jieba_add_dict, \
from .validators import check_lookup, check_jieba_add_dict, check_to_vectors, \
check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \
check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow, \
@ -345,6 +345,7 @@ class SentencePieceTokenizer(TextTensorOperation):
>>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer)
"""
@check_sentence_piece_tokenizer
def __init__(self, mode, out_type):
self.mode = mode
@ -421,6 +422,36 @@ class ToNumber(TextTensorOperation):
return cde.ToNumberOperation(self.data_type)
class ToVectors(TextTensorOperation):
"""
Look up a token's vector according to the input vector table.
Args:
vectors (Vectors): A vectors object.
unk_init (sequence, optional): Sequence used to initialize out-of-vectors (OOV) tokens
(default=None, which initializes with zero vectors).
lower_case_backup (bool, optional): Whether to also look up the token in lower case. If False, each token in
its original case will be looked up. If True, the token in its original case will be looked up first; if it is
not found among the keys of the property stoi, the token in lower case will be looked up (default=False).
Examples:
>>> # Load vectors from file
>>> vectors = text.Vectors.from_file("/path/to/vectors/file")
>>> # Use ToVectors operator to map tokens to vectors
>>> to_vectors = text.ToVectors(vectors)
>>> text_file_dataset = text_file_dataset.map(operations=[to_vectors])
"""
@check_to_vectors
def __init__(self, vectors, unk_init=None, lower_case_backup=False):
self.vectors = vectors
self.unk_init = unk_init if unk_init is not None else []
self.lower_case_backup = lower_case_backup
def parse(self):
return cde.ToVectorsOperation(self.vectors, self.unk_init, self.lower_case_backup)
class TruncateSequencePair(TextTensorOperation):
"""
Truncate a pair of rank-1 tensors such that the total length is less than max_length.

View File

@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -16,16 +16,18 @@ The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary,
use to_bytes and to_str to encode and decode strings into a specified format.
"""
from enum import IntEnum
import numpy as np
import mindspore._c_dataengine as cde
import mindspore._c_dataengine as cde
from .validators import check_from_file, check_from_list, check_from_dict, check_from_dataset, \
check_from_dataset_sentencepiece, check_from_file_sentencepiece, check_save_model
check_from_dataset_sentencepiece, check_from_file_sentencepiece, check_save_model, \
check_from_file_vectors
__all__ = [
"Vocab", "SentencePieceVocab", "to_str", "to_bytes"
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors"
]
@ -383,3 +385,29 @@ class SPieceTokenizerLoadType(IntEnum):
"""
FILE = 0
MODEL = 1
class Vectors(cde.Vectors):
"""
Vectors object that is used to map tokens into vectors.
"""
@classmethod
@check_from_file_vectors
def from_file(cls, file_path, max_vectors=None):
"""
Build a Vectors object from a file.
Args:
file_path (str): Path of the file that contains the vectors.
max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
situations where the entire set doesn't fit in memory, or is not needed for some other reason,
passing max_vectors can limit the size of the loaded set (default=None, no limit).
Examples:
>>> vector = text.Vectors.from_file("/path/to/vectors/file", max_vectors=None)
"""
max_vectors = max_vectors if max_vectors is not None else 0
return super().from_file(file_path, max_vectors)
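Because pre-trained sets are sorted by descending word frequency, a small `max_vectors` keeps only the head of the file, and every later token falls back to `unk_init` at lookup time. A sketch mirroring this commit's test data (the relative path is the one the new tests use):

import mindspore.dataset.text as text

# With max_vectors=4 only "ok", "!", "this" and "is" are loaded;
# "my" and "home" then miss the table and resolve to the unk_init vector.
vectors = text.Vectors.from_file("../data/dataset/testVectors/vectors.txt", max_vectors=4)
to_vectors = text.ToVectors(vectors, unk_init=[-1.0] * 6)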

View File

@ -15,15 +15,14 @@
"""
validators for text ops
"""
from functools import wraps
import mindspore.common.dtype as mstype
import mindspore._c_dataengine as cde
import mindspore.common.dtype as mstype
from mindspore._c_expression import typing
from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, \
INT32_MAX, check_value, check_positive, check_pos_int32
INT32_MAX, check_value, check_positive, check_pos_int32, check_filename, check_non_negative_int32
def check_unique_list_of_words(words, arg_name):
@ -532,3 +531,39 @@ def check_sentence_piece_tokenizer(method):
return method(self, *args, **kwargs)
return new_method
def check_from_file_vectors(method):
"""A wrapper that wraps a parameter checker to from_file of class Vectors."""
@wraps(method)
def new_method(self, *args, **kwargs):
[file_path, max_vectors], _ = parse_user_args(method, *args, **kwargs)
type_check(file_path, (str,), "file_path")
check_filename(file_path)
if max_vectors is not None:
type_check(max_vectors, (int,), "max_vectors")
check_non_negative_int32(max_vectors, "max_vectors")
return method(self, *args, **kwargs)
return new_method
def check_to_vectors(method):
"""A wrapper that wraps a parameter checker to ToVectors."""
@wraps(method)
def new_method(self, *args, **kwargs):
[vectors, unk_init, lower_case_backup], _ = parse_user_args(method, *args, **kwargs)
type_check(vectors, (cde.Vectors,), "vectors")
if unk_init is not None:
type_check(unk_init, (list, tuple), "unk_init")
for i, value in enumerate(unk_init):
type_check(value, (int, float), "unk_init[{0}]".format(i))
type_check(lower_case_backup, (bool,), "lower_case_backup")
return method(self, *args, **kwargs)
return new_method
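A sketch of inputs that the two checkers above reject; the exact exception types and messages come from the shared validator helpers:

import mindspore.dataset.text as text

try:
    # check_from_file_vectors: max_vectors must be a non-negative int32.
    text.Vectors.from_file("/path/to/vectors/file", max_vectors=-1)
except ValueError as e:
    print(e)

try:
    # check_to_vectors: vectors must be a text.Vectors instance.
    text.ToVectors("not a Vectors object")
except TypeError as e:
    print(e)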

View File

@ -52,6 +52,7 @@ SET(DE_UT_SRCS
c_api_samplers_test.cc
c_api_text_sentence_piece_vocab_test.cc
c_api_text_vocab_test.cc
c_api_text_test.cc
c_api_transforms_test.cc
c_api_vision_a_to_q_test.cc
c_api_vision_affine_test.cc

View File

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include <memory>
#include <vector>
#include <string>
#include <vector>
#include "common/common.h"
#include "include/api/status.h"
@ -23,12 +23,14 @@
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
using namespace mindspore::dataset;
using mindspore::Status;
using mindspore::dataset::ShuffleMode;
using mindspore::dataset::Tensor;
using mindspore::dataset::Vectors;
using mindspore::dataset::Vocab;
class MindDataTestPipeline : public UT::DatasetOpTesting {
@ -892,7 +894,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
std::vector<std::string> expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};
std::shared_ptr<Tensor> de_expected_tokens;
ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
mindspore::MSTensor ms_expected_tokens =
@ -1596,7 +1598,8 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
std::shared_ptr<TensorTransform> to_number =
std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -3543,3 +3546,400 @@ TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with default parameters of BuildFromFile and Lookup
/// Expectation: return a correct MSTensor equal to the expected one
TEST_F(MindDataTestPipeline, TestVectorsDefaultParam) {
// Test with default parameter.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDefaultParam.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with all parameters, which include `path` and `max_vectors`, in BuildFromFile
/// Expectation: return a correct MSTensor equal to the expected one
TEST_F(MindDataTestPipeline, TestVectorsAllBuildfromfileParams) {
// Test with two parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllBuildfromfileParams.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with all parameters in BuildFromFile and `unknown_init` in Lookup
/// Expectation: return a correct MSTensor equal to the expected one
TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) {
// Test with two parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsUnknownInit.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with all parameters, which include `path` and `max_vectors` in BuildFromFile and `token`,
///     `unknown_init` and `lower_case_backup` in Lookup, where some tokens contain uppercase letters
/// Expectation: return a correct MSTensor equal to the expected one
TEST_F(MindDataTestPipeline, TestVectorsAllParams) {
// Test with all parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with a pre-trained vector set whose vectors have different dimensions
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) {
// Tokens' vectors don't all have the same dimension.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_dim_different.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
EXPECT_NE(s, Status::OK());
}
/// Feature: Vectors
/// Description: test with a pre-trained vector set that has an info header
/// Expectation: return a correct MSTensor equal to the expected one
TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) {
// Test with a vector file that has an info header.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_info.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with a max_vectors parameter that is negative
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) {
// Test with max_vectors < 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir, -1);
EXPECT_NE(s, Status::OK());
}
/// Feature: Vectors
/// Description: test with a pre-trained vector file that is empty
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) {
// Read empty file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_NE(s, Status::OK());
}
/// Feature: Vectors
/// Description: test with a pre-trained vector file that does not exist
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) {
// Test with a file that does not exist.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_NE(s, Status::OK());
}
/// Feature: Vectors
/// Description: test with a pre-trained vector set whose info header is not on the first line
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
// Info header in the wrong position.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_wrong_info.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_NE(s, Status::OK());
}

View File

@ -23,10 +23,12 @@
#include "minddata/dataset/include/dataset/vision.h"
#include "minddata/dataset/include/dataset/audio.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/text/vectors.h"
#include "utils/log_adapter.h"
using namespace mindspore::dataset;
using mindspore::LogStream;
using mindspore::dataset::Vectors;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::MsLogLevel::INFO;
@ -1529,6 +1531,140 @@ TEST_F(MindDataTestExecute, TestFlangerWithWrongArg) {
EXPECT_FALSE(s01.IsOk());
}
/// Feature: Vectors
/// Description: test basic usage of Vectors and of ToVectors with default parameters
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestVectorsParam) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestVectorsParam.";
std::shared_ptr<Tensor> de_tensor;
Tensor::CreateScalar<std::string>("ok", &de_tensor);
auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
mindspore::MSTensor lookup_result;
// Create expected output.
std::shared_ptr<Tensor> de_expected;
std::vector<float> expected = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected, TensorShape({dim}), &de_expected));
auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
// Transform params.
std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors01;
Status s01 = Vectors::BuildFromFile(&vectors01, vectors_dir);
EXPECT_EQ(s01, Status::OK());
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors01);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
EXPECT_TRUE(status01.IsOk());
std::shared_ptr<Vectors> vectors02;
Status s02 = Vectors::BuildFromFile(&vectors02, vectors_dir, 100);
EXPECT_EQ(s02, Status::OK());
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors02);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
EXPECT_TRUE(status02.IsOk());
std::shared_ptr<Vectors> vectors03;
Status s03 = Vectors::BuildFromFile(&vectors03, vectors_dir, 3);
EXPECT_EQ(s03, Status::OK());
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(vectors03);
auto transform03 = Execute({to_vectors03});
Status status03 = transform03(token, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
EXPECT_TRUE(status03.IsOk());
}
/// Feature: ToVectors
/// Description: test basic usage of ToVectors and of Vectors with default parameters
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestToVectorsParam) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParam.";
std::shared_ptr<Tensor> de_tensor01;
Tensor::CreateScalar<std::string>("none", &de_tensor01);
auto token01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor01));
std::shared_ptr<Tensor> de_tensor02;
Tensor::CreateScalar<std::string>("ok", &de_tensor02);
auto token02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor02));
std::shared_ptr<Tensor> de_tensor03;
Tensor::CreateScalar<std::string>("OK", &de_tensor03);
auto token03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor03));
mindspore::MSTensor lookup_result;
// Create expected output.
dsize_t dim = 6;
std::shared_ptr<Tensor> de_expected01;
std::vector<float> expected01 = {0, 0, 0, 0, 0, 0};
ASSERT_OK(Tensor::CreateFromVector(expected01, TensorShape({dim}), &de_expected01));
auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01));
std::shared_ptr<Tensor> de_expected02;
std::vector<float> expected02 = {-1, -1, -1, -1, -1, -1};
ASSERT_OK(Tensor::CreateFromVector(expected02, TensorShape({dim}), &de_expected02));
auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02));
std::shared_ptr<Tensor> de_expected03;
std::vector<float> expected03 = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
ASSERT_OK(Tensor::CreateFromVector(expected03, TensorShape({dim}), &de_expected03));
auto ms_expected03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected03));
// Transform params.
std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token01, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected01);
EXPECT_TRUE(status01.IsOk());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors, unknown_init);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token01, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected02);
EXPECT_TRUE(status02.IsOk());
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(vectors, unknown_init);
auto transform03 = Execute({to_vectors03});
Status status03 = transform03(token02, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
EXPECT_TRUE(status03.IsOk());
std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
auto transform04 = Execute({to_vectors04});
Status status04 = transform04(token03, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
EXPECT_TRUE(status04.IsOk());
}
/// Feature: ToVectors
/// Description: test invalid parameter of ToVectors
/// Expectation: throw exception correctly
TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParam) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParam.";
std::shared_ptr<Tensor> de_tensor;
Tensor::CreateScalar<std::string>("none", &de_tensor);
auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
mindspore::MSTensor lookup_result;
// Transform params.
std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors01;
Status s = Vectors::BuildFromFile(&vectors01, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1};
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors01, unknown_init);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token, &lookup_result);
EXPECT_FALSE(status01.IsOk());
std::shared_ptr<Vectors> vectors02 = nullptr;
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors02);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token, &lookup_result);
EXPECT_FALSE(status02.IsOk());
}
// Feature: DBToAmplitude
// Description: test DBToAmplitude in eager mode
// Expectation: the data is processed successfully

View File

@ -0,0 +1,6 @@
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,6 @@
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
6 6
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
6 6
of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
ok
.
this
is
my
home
.

View File

@ -0,0 +1,7 @@
ok
!
This
iS
my
HOME
.

View File

@ -0,0 +1,236 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pytest
from mindspore import log
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.text.transforms as T
DATASET_ROOT_PATH = "../data/dataset/testVectors/"
def test_vectors_all_tovectors_params_eager():
"""
Feature: Vectors
Description: test with all parameters, which include `unk_init`
and `lower_case_backup`, in ToVectors in eager mode
Expectation: output is equal to the expected value
"""
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=4)
myUnk = [-1, -1, -1, -1, -1, -1]
to_vectors = T.ToVectors(vectors, unk_init=myUnk, lower_case_backup=True)
result1 = to_vectors("Ok")
result2 = to_vectors("!")
result3 = to_vectors("This")
result4 = to_vectors("is")
result5 = to_vectors("my")
result6 = to_vectors("home")
result7 = to_vectors("none")
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[-1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1]]
res_array = np.array(res, dtype=np.float32)
assert np.array_equal(result1, res_array[0])
assert np.array_equal(result2, res_array[1])
assert np.array_equal(result3, res_array[2])
assert np.array_equal(result4, res_array[3])
assert np.array_equal(result5, res_array[4])
assert np.array_equal(result6, res_array[5])
assert np.array_equal(result7, res_array[6])


def test_vectors_from_file():
    """
    Feature: Vectors
    Description: test Vectors.from_file and ToVectors with default parameters in a data pipeline
    Expectation: output is equal to the expected value
    """
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt")
to_vectors = text.ToVectors(vectors)
data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
data = data.map(operations=to_vectors, input_columns=["text"])
ind = 0
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0, 0, 0, 0, 0, 0],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind
ind += 1


def test_vectors_from_file_all_buildfromfile_params():
    """
    Feature: Vectors
    Description: test from_file with all parameters, including `path` and `max_vectors`
    Expectation: output is equal to the expected value
    """
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=100)
to_vectors = text.ToVectors(vectors)
data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
data = data.map(operations=to_vectors, input_columns=["text"])
ind = 0
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0, 0, 0, 0, 0, 0],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind
ind += 1


def test_vectors_from_file_all_buildfromfile_params_eager():
    """
    Feature: Vectors
    Description: test from_file with all parameters, including `path` and `max_vectors`, in eager mode
    Expectation: output is equal to the expected value
    """
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=4)
to_vectors = T.ToVectors(vectors)
result1 = to_vectors("ok")
result2 = to_vectors("!")
result3 = to_vectors("this")
result4 = to_vectors("is")
result5 = to_vectors("my")
result6 = to_vectors("home")
result7 = to_vectors("none")
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]]
res_array = np.array(res, dtype=np.float32)
assert np.array_equal(result1, res_array[0])
assert np.array_equal(result2, res_array[1])
assert np.array_equal(result3, res_array[2])
assert np.array_equal(result4, res_array[3])
assert np.array_equal(result5, res_array[4])
assert np.array_equal(result6, res_array[5])
assert np.array_equal(result7, res_array[6])


def test_vectors_from_file_eager():
    """
    Feature: Vectors
    Description: test from_file and ToVectors with default parameters in eager mode
    Expectation: output is equal to the expected value
    """
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt")
to_vectors = T.ToVectors(vectors)
result1 = to_vectors("ok")
result2 = to_vectors("!")
result3 = to_vectors("this")
result4 = to_vectors("is")
result5 = to_vectors("my")
result6 = to_vectors("home")
result7 = to_vectors("none")
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
res_array = np.array(res, dtype=np.float32)
assert np.array_equal(result1, res_array[0])
assert np.array_equal(result2, res_array[1])
assert np.array_equal(result3, res_array[2])
assert np.array_equal(result4, res_array[3])
assert np.array_equal(result5, res_array[4])
assert np.array_equal(result6, res_array[5])
assert np.array_equal(result7, res_array[6])


def test_vectors_invalid_input():
    """
    Feature: Vectors
    Description: test Vectors and ToVectors with invalid input parameters
    Expectation: correct error and message are raised as expected
    """
def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None,
unk_init=None, lower_case_backup=False, token="ok"):
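        """Build Vectors/ToVectors with the given arguments and check that `error` is raised with `error_msg`."""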
log.info("Test Vectors with wrong input: {0}".format(test_name))
with pytest.raises(error) as error_info:
vectors = text.Vectors.from_file(file_path, max_vectors=max_vectors)
to_vectors = T.ToVectors(vectors, unk_init=unk_init, lower_case_backup=lower_case_backup)
to_vectors(token)
assert error_msg in str(error_info.value)
test_invalid_input("Not all vectors have the same number of dimensions",
DATASET_ROOT_PATH + "vectors_dim_different.txt", error=RuntimeError,
error_msg="all vectors must have the same number of dimensions, but got dim 5 while expecting 6")
test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "vectors_empty.txt",
error=RuntimeError, error_msg="invalid file, file is empty.")
test_invalid_input("the count of `unknown_init`'s element is different with word vector.",
DATASET_ROOT_PATH + "vectors.txt",
error=RuntimeError, error_msg="Unexpected error. ToVectors: " +
"unk_init must be the same length as vectors, but got unk_init: 2 and vectors: 6",
unk_init=[-1, -1])
test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.txt", error=RuntimeError,
error_msg="get real path failed")
test_invalid_input("The token is 1-dimensional",
DATASET_ROOT_PATH + "vectors_with_wrong_info.txt", error=RuntimeError,
error_msg="token with 1-dimensional vector.")
test_invalid_input("max_vectors parameter must be greater than 0",
DATASET_ROOT_PATH + "vectors.txt", error=ValueError,
error_msg="Input max_vectors is not within the required interval", max_vectors=-1)
test_invalid_input("invalid max_vectors parameter type as a float",
DATASET_ROOT_PATH + "vectors.txt", error=TypeError,
error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
" but got <class 'float'>.", max_vectors=1.0)
test_invalid_input("invalid max_vectors parameter type as a string",
DATASET_ROOT_PATH + "vectors.txt", error=TypeError,
error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
" but got <class 'str'>.", max_vectors="1")
test_invalid_input("invalid token parameter type as a float", DATASET_ROOT_PATH + "vectors.txt", error=RuntimeError,
error_msg="input tensor type should be string.", token=1.0)
test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "vectors.txt",
error=TypeError, error_msg="Argument lower_case_backup with " +
"value True is not of type [<class 'bool'>],"
" but got <class 'str'>.", lower_case_backup="True")
test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "vectors.txt",
error=TypeError, error_msg="Argument lower_case_backup with " +
"value True is not of type [<class 'bool'>],"
" but got <class 'str'>.", lower_case_backup="True")


if __name__ == '__main__':
test_vectors_all_tovectors_params_eager()
test_vectors_from_file()
test_vectors_from_file_all_buildfromfile_params()
test_vectors_from_file_all_buildfromfile_params_eager()
test_vectors_from_file_eager()
test_vectors_invalid_input()