!21690 [assistant][ops] Add data operator Vectors
Merge pull request !21690 from 张渝/Vectors
Commit: eb9537af11
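For orientation, a minimal end-to-end sketch of the Python API this change introduces; the file paths are placeholders, and the "text" column name follows the TextFile-based tests further down:

import mindspore.dataset as ds
import mindspore.dataset.text as text

# Build the Vectors table from a pre-trained embedding file, then map each
# token in the pipeline to its embedding vector.
vectors = text.Vectors.from_file("/path/to/vectors/file")
to_vectors = text.ToVectors(vectors)

text_file_dataset = ds.TextFileDataset("/path/to/words.txt", shuffle=False)
text_file_dataset = text_file_dataset.map(operations=[to_vectors], input_columns=["text"])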
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,13 +18,13 @@
 #include "pybind11/stl_bind.h"
 
 #include "minddata/dataset/api/python/pybind_register.h"
-#include "minddata/dataset/text/vocab.h"
-#include "minddata/dataset/text/sentence_piece_vocab.h"
 #include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/text/sentence_piece_vocab.h"
+#include "minddata/dataset/text/vectors.h"
+#include "minddata/dataset/text/vocab.h"
 
 namespace mindspore {
 namespace dataset {
 
 PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
                   (void)py::class_<Vocab, std::shared_ptr<Vocab>>(*m, "Vocab")
                     .def(py::init<>())
@@ -88,5 +88,14 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) {
                     .export_values();
                 }));
 
+PYBIND_REGISTER(Vectors, 0, ([](const py::module *m) {
+                  (void)py::class_<Vectors, std::shared_ptr<Vectors>>(*m, "Vectors")
+                    .def(py::init<>())
+                    .def_static("from_file", [](const std::string &path, int32_t max_vectors) {
+                      std::shared_ptr<Vectors> vectors;
+                      THROW_IF_ERROR(Vectors::BuildFromFile(&vectors, path, max_vectors));
+                      return vectors;
+                    });
+                }));
 } // namespace dataset
 } // namespace mindspore
@@ -19,6 +19,7 @@
 #include "minddata/dataset/api/python/pybind_register.h"
 #include "minddata/dataset/text/ir/kernels/text_ir.h"
 #include "minddata/dataset/text/sentence_piece_vocab.h"
+#include "minddata/dataset/text/vectors.h"
 #include "minddata/dataset/text/vocab.h"
 
 namespace mindspore {
@@ -208,6 +209,18 @@ PYBIND_REGISTER(ToNumberOperation, 1, ([](const py::module *m) {
                 }));
               }));
+
+PYBIND_REGISTER(
+  ToVectorsOperation, 1, ([](const py::module *m) {
+    (void)py::class_<text::ToVectorsOperation, TensorOperation, std::shared_ptr<text::ToVectorsOperation>>(
+      *m, "ToVectorsOperation")
+      .def(py::init(
+        [](const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup) {
+          auto to_vectors = std::make_shared<text::ToVectorsOperation>(vectors, unk_init, lower_case_backup);
+          THROW_IF_ERROR(to_vectors->ValidateParams());
+          return to_vectors;
+        }));
+  }));
 
 PYBIND_REGISTER(TruncateSequencePairOperation, 1, ([](const py::module *m) {
                   (void)py::class_<text::TruncateSequencePairOperation, TensorOperation,
                                    std::shared_ptr<text::TruncateSequencePairOperation>>(
@@ -358,6 +358,22 @@ ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>
 
 std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
 
+// ToVectors
+struct ToVectors::Data {
+  Data(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup)
+      : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
+  std::shared_ptr<Vectors> vectors_;
+  std::vector<float> unk_init_;
+  bool lower_case_backup_;
+};
+
+ToVectors::ToVectors(const std::shared_ptr<Vectors> &vectors, const std::vector<float> unk_init, bool lower_case_backup)
+    : data_(std::make_shared<Data>(vectors, unk_init, lower_case_backup)) {}
+
+std::shared_ptr<TensorOperation> ToVectors::Parse() {
+  return std::make_shared<ToVectorsOperation>(data_->vectors_, data_->unk_init_, data_->lower_case_backup_);
+}
+
 // TruncateSequencePair
 struct TruncateSequencePair::Data {
   explicit Data(int32_t max_length) : max_length_(max_length) {}
@@ -31,13 +31,13 @@
 namespace mindspore {
 namespace dataset {
 
-class Vocab;
 class SentencePieceVocab;
 class TensorOperation;
+class Vectors;
+class Vocab;
 
 // Transform operations for text
 namespace text {
 
 #ifndef _WIN32
 /// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
 /// \note BasicTokenizer is not supported on the Windows platform yet.
@@ -629,6 +629,30 @@ class MS_API ToNumber final : public TensorTransform {
   std::shared_ptr<Data> data_;
 };
 
+/// \brief Look up a token's vector according to the input Vectors table.
+class ToVectors final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  /// \param[in] vectors A Vectors object.
+  /// \param[in] unk_init If the token is out-of-vectors (OOV), the result will be initialized with `unk_init`
+  ///     (default={}, which means to initialize with zero vectors).
+  /// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false).
+  explicit ToVectors(const std::shared_ptr<Vectors> &vectors, std::vector<float> unk_init = {},
+                     bool lower_case_backup = false);
+
+  /// \brief Destructor.
+  ~ToVectors() = default;
+
+ protected:
+  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to the TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+
+ private:
+  struct Data;
+  std::shared_ptr<Data> data_;
+};
+
 /// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
 class MS_API TruncateSequencePair final : public TensorTransform {
  public:
@@ -133,6 +133,7 @@ constexpr char kNormalizeUTF8Op[] = "NormalizeUTF8Op";
 constexpr char kRegexReplaceOp[] = "RegexReplaceOp";
 constexpr char kRegexTokenizerOp[] = "RegexTokenizerOp";
 constexpr char kToNumberOp[] = "ToNumberOp";
+constexpr char kToVectorsOp[] = "ToVectorsOp";
 constexpr char kTruncateSequencePairOp[] = "TruncateSequencePairOp";
 constexpr char kUnicodeCharTokenizerOp[] = "UnicodeCharTokenizerOp";
 constexpr char kUnicodeScriptTokenizerOp[] = "UnicodeScriptTokenizerOp";
@@ -4,6 +4,7 @@ add_subdirectory(kernels)
 file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
 add_library(text OBJECT
+  vectors.cc
   vocab.cc
   sentence_piece_vocab.cc
   )
@@ -33,6 +33,7 @@
 #include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/sliding_window_op.h"
 #include "minddata/dataset/text/kernels/to_number_op.h"
+#include "minddata/dataset/text/kernels/to_vectors_op.h"
 #include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
 #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
@@ -420,6 +421,27 @@ Status ToNumberOperation::from_json(nlohmann::json op_params, std::shared_ptr<Te
   return Status::OK();
 }
 
+// ToVectorsOperation
+ToVectorsOperation::ToVectorsOperation(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
+                                       bool lower_case_backup)
+    : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
+
+ToVectorsOperation::~ToVectorsOperation() = default;
+
+Status ToVectorsOperation::ValidateParams() {
+  if (vectors_ == nullptr) {
+    std::string err_msg = "ToVectors: vectors can't be nullptr.";
+    MS_LOG(ERROR) << err_msg;
+    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
+  }
+  return Status::OK();
+}
+
+std::shared_ptr<TensorOp> ToVectorsOperation::Build() {
+  std::shared_ptr<ToVectorsOp> tensor_op = std::make_shared<ToVectorsOp>(vectors_, unk_init_, lower_case_backup_);
+  return tensor_op;
+}
+
 // TruncateSequencePairOperation
 TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}
 
@@ -27,6 +27,7 @@
 
 namespace mindspore {
 namespace dataset {
+class Vectors;
 class Vocab;
 class SentencePieceVocab;
 
@@ -45,6 +46,7 @@ constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
 constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
 constexpr char kSlidingWindowOperation[] = "SlidingWindow";
 constexpr char kToNumberOperation[] = "ToNumber";
+constexpr char kToVectorsOperation[] = "ToVectors";
 constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
 constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
 constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
@@ -294,6 +296,25 @@ class ToNumberOperation : public TensorOperation {
   DataType data_type_;
 };
 
+class ToVectorsOperation : public TensorOperation {
+ public:
+  ToVectorsOperation(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
+                     bool lower_case_backup);
+
+  ~ToVectorsOperation();
+
+  std::shared_ptr<TensorOp> Build() override;
+
+  Status ValidateParams() override;
+
+  std::string Name() const override { return kToVectorsOperation; }
+
+ private:
+  std::shared_ptr<Vectors> vectors_;
+  std::vector<float> unk_init_;
+  bool lower_case_backup_;
+};
+
 class TruncateSequencePairOperation : public TensorOperation {
  public:
  explicit TruncateSequencePairOperation(int32_t max_length);
@@ -22,6 +22,7 @@ add_library(text-kernels OBJECT
   wordpiece_tokenizer_op.cc
   truncate_sequence_pair_op.cc
   to_number_op.cc
+  to_vectors_op.cc
   sentence_piece_tokenizer_op.cc
   ${ICU_DEPEND_FILES}
   )
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "minddata/dataset/text/kernels/to_vectors_op.h"
+
+namespace mindspore {
+namespace dataset {
+ToVectorsOp::ToVectorsOp(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
+                         bool lower_case_backup)
+    : vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
+
+Status ToVectorsOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
+  IO_CHECK(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
+  CHECK_FAIL_RETURN_UNEXPECTED(unk_init_.size() == 0 || unk_init_.size() == vectors_->Dim(),
+                               "ToVectors: unk_init must be the same length as vectors, but got unk_init: " +
+                                 std::to_string(unk_init_.size()) + " and vectors: " + std::to_string(vectors_->Dim()));
+
+  std::vector<float> vectors_vec;
+  int len = 0;
+  for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
+    std::vector<float> vectors_value = vectors_->Lookup(std::string(*itr), unk_init_, lower_case_backup_);
+    CHECK_FAIL_RETURN_UNEXPECTED(!vectors_value.empty(), "ToVectors: invalid data, token: \"" + std::string(*itr) +
+                                                           "\" doesn't exist in vectors and no unk_init is specified.");
+    vectors_vec.insert(vectors_vec.end(), vectors_value.begin(), vectors_value.end());
+    len++;
+  }
+
+  int dim = static_cast<int>(vectors_vec.size() / len);
+  if (vectors_vec.size() == dim) {
+    RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, output));
+  } else {
+    RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, TensorShape({len, dim}), output));
+  }
+  return Status::OK();
+}
+
+Status ToVectorsOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
+  CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() == NumInput() && outputs.size() == NumOutput(),
+                               "ToVectors: input and output size don't match.");
+  CHECK_FAIL_RETURN_UNEXPECTED(inputs[0] == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
+  outputs[0] = DataType(DataType::DE_FLOAT32);
+  return Status::OK();
+}
+} // namespace dataset
+} // namespace mindspore
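A note on the shape logic in Compute above: when the flattened result equals exactly one vector of length dim, a 1-D tensor is produced; otherwise the output is reshaped to {len, dim}. A hedged Python illustration (the column name and the dim of 6 are assumed from the test data below):

# Each scalar-token row yields a 1-D float32 vector of shape (dim,); a row
# holding several tokens would yield a 2-D tensor of shape (num_tokens, dim).
for data in text_file_dataset.create_dict_iterator(output_numpy=True):
    print(data["text"].shape)  # e.g. (6,) when dim == 6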
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/text/vectors.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+class ToVectorsOp : public TensorOp {
+ public:
+  /// \brief Constructor.
+  /// \param[in] vectors Vectors used to look up tokens.
+  /// \param[in] unk_init Vector used to initialize OOV tokens.
+  /// \param[in] lower_case_backup Whether to look up the token in the lower case.
+  ToVectorsOp(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup);
+
+  /// \brief Destructor.
+  ~ToVectorsOp() = default;
+
+  /// \brief Perform the actual ToVectors lookup on each tensor.
+  /// \param[in] input Input tensor.
+  /// \param[out] output Output tensor.
+  /// \return Status code.
+  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+
+  /// \param[in] inputs DataType of the input tensor.
+  /// \param[out] outputs DataType of the output tensor.
+  /// \return Status code.
+  Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
+
+  /// \brief Get the op name.
+  std::string Name() const override { return kToVectorsOp; }
+
+ private:
+  std::shared_ptr<Vectors> vectors_;
+  std::vector<float> unk_init_;
+  bool lower_case_backup_;
+};
+} // namespace dataset
+} // namespace mindspore
+#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
@@ -0,0 +1,145 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/text/vectors.h"
+
+#include "utils/file_utils.h"
+
+namespace mindspore {
+namespace dataset {
+Status Vectors::InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines,
+                           int32_t *vector_dim) {
+  RETURN_UNEXPECTED_IF_NULL(num_lines);
+  RETURN_UNEXPECTED_IF_NULL(header_num_lines);
+  RETURN_UNEXPECTED_IF_NULL(vector_dim);
+
+  std::ifstream file_reader;
+  file_reader.open(path, std::ios::in);
+  CHECK_FAIL_RETURN_UNEXPECTED(file_reader.is_open(), "Vectors: invalid file, failed to open vector file: " + path);
+
+  *num_lines = 0, *header_num_lines = 0, *vector_dim = -1;
+  std::string line, row;
+  while (std::getline(file_reader, line)) {
+    if (*vector_dim == -1) {
+      std::vector<std::string> vec;
+      std::istringstream line_reader(line);
+      while (std::getline(line_reader, row, ' ')) {
+        vec.push_back(row);
+      }
+      // The number of rows and dimensions can be obtained directly from the information header.
+      const int kInfoHeaderSize = 2;
+      if (vec.size() == kInfoHeaderSize) {
+        (*header_num_lines)++;
+      } else {
+        *vector_dim = vec.size() - 1;
+        (*num_lines)++;
+      }
+    } else {
+      (*num_lines)++;
+    }
+  }
+  CHECK_FAIL_RETURN_UNEXPECTED(*num_lines > 0, "Vectors: invalid file, file is empty.");
+
+  if (max_vectors > 0) {
+    *num_lines = std::min(max_vectors, *num_lines);  // Determine the true rows.
+  }
+  return Status::OK();
+}
+
+Status Vectors::Load(const std::string &path, int32_t max_vectors,
+                     std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim) {
+  RETURN_UNEXPECTED_IF_NULL(map);
+  RETURN_UNEXPECTED_IF_NULL(vector_dim);
+  auto realpath = FileUtils::GetRealPath(common::SafeCStr(path));
+  CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Vectors: get real path failed, path: " + path);
+  auto file_path = realpath.value();
+
+  CHECK_FAIL_RETURN_UNEXPECTED(max_vectors >= 0,
+                               "Vectors: max_vectors must be non-negative, but got: " + std::to_string(max_vectors));
+
+  int num_lines = 0, header_num_lines = 0;
+  RETURN_IF_NOT_OK(InferShape(file_path, max_vectors, &num_lines, &header_num_lines, vector_dim));
+
+  std::fstream file_reader;
+  file_reader.open(file_path, std::ios::in);
+  CHECK_FAIL_RETURN_UNEXPECTED(file_reader.is_open(),
+                               "Vectors: invalid file, failed to open vector file: " + file_path);
+
+  while (header_num_lines > 0) {
+    file_reader.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+    header_num_lines--;
+  }
+
+  std::string line, token, vector_value;
+  for (auto i = 0; i < num_lines; ++i) {
+    std::getline(file_reader, line);
+    std::istringstream line_reader(line);
+    std::getline(line_reader, token, ' ');
+    std::vector<float> vector_values;
+    int dim = 0;
+    while (line_reader >> vector_value) {
+      dim++;
+      vector_values.push_back(atof(vector_value.c_str()));
+    }
+    CHECK_FAIL_RETURN_UNEXPECTED(dim > 1, "Vectors: token with 1-dimensional vector.");
+    CHECK_FAIL_RETURN_UNEXPECTED(dim == *vector_dim,
+                                 "Vectors: all vectors must have the same number of dimensions, but got dim " +
+                                   std::to_string(dim) + " while expecting " + std::to_string(*vector_dim));
+
+    auto token_index = map->find(token);
+    if (token_index == map->end()) {
+      (*map)[token] = vector_values;
+    }
+  }
+  return Status::OK();
+}
+
+Vectors::Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim) {
+  map_ = std::move(map);
+  dim_ = dim;
+}
+
+Status Vectors::BuildFromFile(std::shared_ptr<Vectors> *vectors, const std::string &path, int32_t max_vectors) {
+  std::unordered_map<std::string, std::vector<float>> map;
+  int vector_dim = -1;
+  RETURN_IF_NOT_OK(Load(path, max_vectors, &map, &vector_dim));
+  *vectors = std::make_shared<Vectors>(std::move(map), vector_dim);
+  return Status::OK();
+}
+
+std::vector<float> Vectors::Lookup(const std::string &token, const std::vector<float> &unk_init,
+                                   bool lower_case_backup) {
+  std::vector<float> init_vec(dim_, 0);
+  if (!unk_init.empty()) {
+    if (unk_init.size() != dim_) {
+      MS_LOG(WARNING) << "Vectors: size of unk_init is not the same as vectors, will initialize with zero vectors.";
+    } else {
+      init_vec = unk_init;
+    }
+  }
+  std::string lower_token = token;
+  if (lower_case_backup) {
+    transform(lower_token.begin(), lower_token.end(), lower_token.begin(), ::tolower);
+  }
+  auto str_index = map_.find(lower_token);
+  if (str_index == map_.end()) {
+    return init_vec;
+  } else {
+    return str_index->second;
+  }
+}
+} // namespace dataset
+} // namespace mindspore
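To make the parsing logic above concrete: InferShape treats a first line with exactly two fields as an info header ("rows dim"), and every other line as a token followed by its float values, all space-separated. A hedged sketch of a file Vectors::BuildFromFile would accept (file name and values are made up):

import mindspore.dataset.text as text

# Write a tiny, hypothetical vectors file: optional info header, then one
# "token v1 v2 ... vdim" line per word (dim must be at least 2 per Load).
with open("toy_vectors.txt", "w") as f:
    f.write("2 3\n")               # optional info header: 2 rows, dim 3
    f.write("hello 0.1 0.2 0.3\n")
    f.write("world 0.4 0.5 0.6\n")

vectors = text.Vectors.from_file("toy_vectors.txt")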
@@ -0,0 +1,89 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
+
+#include <algorithm>
+#include <fstream>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/include/dataset/iterator.h"
+
+namespace mindspore {
+namespace dataset {
+/// \brief Pre-trained word vectors.
+class Vectors {
+ public:
+  /// Constructor.
+  Vectors() = default;
+
+  /// Constructor.
+  /// \param[in] map A map between strings and vectors.
+  /// \param[in] dim Dimension of the vectors.
+  Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim);
+
+  /// Destructor.
+  virtual ~Vectors() = default;
+
+  /// \brief Build Vectors by reading a pre-trained vector file.
+  /// \param[out] vectors Vectors object which contains the pre-trained vectors.
+  /// \param[in] path Path to the pre-trained word vector file.
+  /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
+  static Status BuildFromFile(std::shared_ptr<Vectors> *vectors, const std::string &path, int32_t max_vectors = 0);
+
+  /// \brief Look up the embedding vector of a token.
+  /// \param[in] token A token to be looked up.
+  /// \param[in] unk_init If the token is out-of-vectors (OOV), the result will be initialized with `unk_init`
+  ///     (default={}, which means to initialize with zero vectors).
+  /// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false).
+  /// \return The vector of the input token.
+  virtual std::vector<float> Lookup(const std::string &token, const std::vector<float> &unk_init = {},
+                                    bool lower_case_backup = false);
+
+  /// \brief Getter of the dimension.
+  const int &Dim() const { return dim_; }
+
+ protected:
+  /// \brief Infer the shape of the pre-trained word vector file.
+  /// \param[in] path Path to the pre-trained word vector file.
+  /// \param[in] max_vectors Maximum number of pre-trained word vectors to be read.
+  /// \param[out] num_lines The number of lines in the file.
+  /// \param[out] header_num_lines The number of lines in the file header.
+  /// \param[out] vector_dim The dimension of the vectors in the file.
+  static Status InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines,
+                           int32_t *vector_dim);
+
+  /// \brief Load the map by reading a pre-trained vector file.
+  /// \param[in] path Path to the pre-trained word vector file.
+  /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded; must be non-negative.
+  /// \param[out] map The map between words and vectors.
+  /// \param[out] vector_dim The dimension of the vectors in the file.
+  static Status Load(const std::string &path, int32_t max_vectors,
+                     std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim);
+
+  int dim_;
+  std::unordered_map<std::string, std::vector<float>> map_;
+};
+} // namespace dataset
+} // namespace mindspore
+#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
@@ -26,15 +26,15 @@ Common imported modules in corresponding API examples are as follows:
 """
 import platform
 from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
-    TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer
+    TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors
 from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
-    SPieceTokenizerOutType, SPieceTokenizerLoadType
+    SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors
 
 __all__ = [
     "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
     "to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
     "PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType",
-    "SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm",
+    "SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors"
 ]
 
 if platform.system().lower() != 'windows':
@@ -48,7 +48,7 @@ import mindspore._c_dataengine as cde
 from mindspore.common import dtype as mstype
 
 from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
-from .validators import check_lookup, check_jieba_add_dict, \
+from .validators import check_lookup, check_jieba_add_dict, check_to_vectors, \
     check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
     check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \
     check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow, \
@@ -345,6 +345,7 @@ class SentencePieceTokenizer(TextTensorOperation):
         >>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
         >>> text_file_dataset = text_file_dataset.map(operations=tokenizer)
     """
 
     @check_sentence_piece_tokenizer
     def __init__(self, mode, out_type):
         self.mode = mode
@@ -421,6 +422,36 @@ class ToNumber(TextTensorOperation):
         return cde.ToNumberOperation(self.data_type)
 
 
+class ToVectors(TextTensorOperation):
+    """
+    Look up a token's vector according to the input vector table.
+
+    Args:
+        vectors (Vectors): A vectors object.
+        unk_init (sequence, optional): Sequence used to initialize out-of-vectors (OOV) tokens
+            (default=None, initialize with zero vectors).
+        lower_case_backup (bool, optional): Whether to look up the token in the lower case. If False, each token in the
+            original case will be looked up; if True, each token in the original case will be looked up first; if not
+            found in the keys of the property stoi, the token in the lower case will be looked up (default=False).
+
+    Examples:
+        >>> # Load vectors from file
+        >>> vectors = text.Vectors.from_file("/path/to/vectors/file")
+        >>> # Use ToVectors operator to map tokens to vectors
+        >>> to_vectors = text.ToVectors(vectors)
+        >>> text_file_dataset = text_file_dataset.map(operations=[to_vectors])
+    """
+
+    @check_to_vectors
+    def __init__(self, vectors, unk_init=None, lower_case_backup=False):
+        self.vectors = vectors
+        self.unk_init = unk_init if unk_init is not None else []
+        self.lower_case_backup = lower_case_backup
+
+    def parse(self):
+        return cde.ToVectorsOperation(self.vectors, self.unk_init, self.lower_case_backup)
+
+
 class TruncateSequencePair(TextTensorOperation):
     """
     Truncate a pair of rank-1 tensors such that the total length is less than max_length.
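As a hedged sketch, the three OOV behaviors this class exposes (the length of 6 is assumed here to match the dimension of the loaded vectors, which the C++ kernel checks):

to_zero = text.ToVectors(vectors)                           # OOV token -> zero vector
to_unk = text.ToVectors(vectors, unk_init=[-1.0] * 6)       # OOV token -> unk_init values
to_lower = text.ToVectors(vectors, lower_case_backup=True)  # fall back to the lower-case token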
@@ -1,4 +1,4 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
+# Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,16 +16,18 @@ The module text.utils provides some general methods for NLP text processing.
 For example, you can use Vocab to build a dictionary,
 use to_bytes and to_str to encode and decode strings into a specified format.
 """
 
 from enum import IntEnum
 
 import numpy as np
-import mindspore._c_dataengine as cde
 
+import mindspore._c_dataengine as cde
 from .validators import check_from_file, check_from_list, check_from_dict, check_from_dataset, \
-    check_from_dataset_sentencepiece, check_from_file_sentencepiece, check_save_model
+    check_from_dataset_sentencepiece, check_from_file_sentencepiece, check_save_model, \
+    check_from_file_vectors
 
 __all__ = [
-    "Vocab", "SentencePieceVocab", "to_str", "to_bytes"
+    "Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors"
 ]
 
 
@@ -383,3 +385,29 @@ class SPieceTokenizerLoadType(IntEnum):
     """
     FILE = 0
     MODEL = 1
+
+
+class Vectors(cde.Vectors):
+    """
+    Vectors object that is used to map tokens into vectors.
+    """
+
+    @classmethod
+    @check_from_file_vectors
+    def from_file(cls, file_path, max_vectors=None):
+        """
+        Build a vector from a file.
+
+        Args:
+            file_path (str): Path of the file that contains the vectors.
+            max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
+                Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
+                situations where the entire set doesn't fit in memory, or is not needed for another reason,
+                passing max_vectors can limit the size of the loaded set (default=None, no limit).
+
+        Examples:
+            >>> vector = text.Vectors.from_file("/path/to/vectors/file", max_vectors=None)
+        """
+
+        max_vectors = max_vectors if max_vectors is not None else 0
+        return super().from_file(file_path, max_vectors)
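Usage sketch for max_vectors (the path is a placeholder): since pre-trained sets are typically sorted by descending word frequency, capping the load keeps the most frequent words:

vectors_all = text.Vectors.from_file("/path/to/vectors/file")                    # load everything
vectors_top = text.Vectors.from_file("/path/to/vectors/file", max_vectors=100)  # first 100 rows only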
@@ -15,15 +15,14 @@
 """
 validators for text ops
 """
 
 from functools import wraps
-import mindspore.common.dtype as mstype
-
 import mindspore._c_dataengine as cde
+import mindspore.common.dtype as mstype
 from mindspore._c_expression import typing
 
 from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, \
-    INT32_MAX, check_value, check_positive, check_pos_int32
+    INT32_MAX, check_value, check_positive, check_pos_int32, check_filename, check_non_negative_int32
 
 
 def check_unique_list_of_words(words, arg_name):
@@ -532,3 +531,39 @@ def check_sentence_piece_tokenizer(method):
         return method(self, *args, **kwargs)
 
     return new_method
+
+
+def check_from_file_vectors(method):
+    """A wrapper that wraps a parameter checker to from_file of class Vectors."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [file_path, max_vectors], _ = parse_user_args(method, *args, **kwargs)
+
+        type_check(file_path, (str,), "file_path")
+        check_filename(file_path)
+        if max_vectors is not None:
+            type_check(max_vectors, (int,), "max_vectors")
+            check_non_negative_int32(max_vectors, "max_vectors")
+
+        return method(self, *args, **kwargs)
+
+    return new_method
+
+
+def check_to_vectors(method):
+    """A wrapper that wraps a parameter checker to ToVectors."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [vectors, unk_init, lower_case_backup], _ = parse_user_args(method, *args, **kwargs)
+
+        type_check(vectors, (cde.Vectors,), "vectors")
+        if unk_init is not None:
+            type_check(unk_init, (list, tuple), "unk_init")
+            for i, value in enumerate(unk_init):
+                type_check(value, (int, float), "unk_init[{0}]".format(i))
+        type_check(lower_case_backup, (bool,), "lower_case_backup")
+        return method(self, *args, **kwargs)
+
+    return new_method
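A hedged illustration of what check_to_vectors accepts and rejects (the error comes from type_check; the exact message text may differ):

text.ToVectors(vectors, unk_init=(0.0,) * 6)   # list/tuple of int/float values: accepted
text.ToVectors(vectors, unk_init="zeros")      # raises TypeError via type_check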
@@ -52,6 +52,7 @@ SET(DE_UT_SRCS
   c_api_samplers_test.cc
   c_api_text_sentence_piece_vocab_test.cc
   c_api_text_vocab_test.cc
   c_api_text_test.cc
   c_api_transforms_test.cc
   c_api_vision_a_to_q_test.cc
   c_api_vision_affine_test.cc
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 #include <memory>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "common/common.h"
 #include "include/api/status.h"
@@ -23,12 +23,14 @@
 #include "minddata/dataset/include/dataset/datasets.h"
 #include "minddata/dataset/include/dataset/text.h"
 #include "minddata/dataset/include/dataset/transforms.h"
+#include "minddata/dataset/text/vectors.h"
 #include "minddata/dataset/text/vocab.h"
 
 using namespace mindspore::dataset;
 using mindspore::Status;
 using mindspore::dataset::ShuffleMode;
 using mindspore::dataset::Tensor;
+using mindspore::dataset::Vectors;
 using mindspore::dataset::Vocab;
 
 class MindDataTestPipeline : public UT::DatasetOpTesting {
@@ -892,7 +894,7 @@ TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
   std::vector<std::string> expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
   std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
   std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};
 
 
   std::shared_ptr<Tensor> de_expected_tokens;
   ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
   mindspore::MSTensor ms_expected_tokens =
@@ -1596,7 +1598,8 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
   EXPECT_NE(ds, nullptr);
 
   // Create ToNumber operation on ds
-  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
+  std::shared_ptr<TensorTransform> to_number =
+    std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
   EXPECT_NE(to_number, nullptr);
 
   // Create a Map operation on ds
@@ -3543,3 +3546,400 @@ TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
   // Manually terminate the pipeline
   iter->Stop();
 }
+
+/// Feature: Vectors
+/// Description: test with default parameters in BuildFromFile and Lookup
+/// Expectation: return correct MSTensor which is equal to the expected
+TEST_F(MindDataTestPipeline, TestVectorsDefaultParam) {
+  // Test with default parameter.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDefaultParam.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
+  EXPECT_EQ(s, Status::OK());
+
+  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
+  EXPECT_NE(lookup, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({lookup}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, mindspore::MSTensor> row;
+  ASSERT_OK(iter->GetNextRow(&row));
+
+  uint64_t i = 0;
+  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
+                                              {0, 0, 0, 0, 0, 0},
+                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
+                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
+                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
+                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
+                                              {0, 0, 0, 0, 0, 0}};
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    MS_LOG(INFO) << ind.Shape();
+    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
+    TensorPtr de_expected_item;
+    dsize_t dim = 6;
+    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
+    mindspore::MSTensor ms_expected_item =
+      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
+    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
+
+    ASSERT_OK(iter->GetNextRow(&row));
+    i++;
+  }
+
+  EXPECT_EQ(i, 7);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+/// Feature: Vectors
+/// Description: test with all parameters, including path and max_vectors, in BuildFromFile
+/// Expectation: return correct MSTensor which is equal to the expected
+TEST_F(MindDataTestPipeline, TestVectorsAllBuildfromfileParams) {
+  // Test with two parameters.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllBuildfromfileParams.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
+  EXPECT_EQ(s, Status::OK());
+
+  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
+  EXPECT_NE(lookup, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({lookup}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, mindspore::MSTensor> row;
+  ASSERT_OK(iter->GetNextRow(&row));
+
+  uint64_t i = 0;
+  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
+                                              {0, 0, 0, 0, 0, 0},
+                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
+                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
+                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
+                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
+                                              {0, 0, 0, 0, 0, 0}};
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    MS_LOG(INFO) << ind.Shape();
+    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
+    TensorPtr de_expected_item;
+    dsize_t dim = 6;
+    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
+    mindspore::MSTensor ms_expected_item =
+      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
+    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
+
+    ASSERT_OK(iter->GetNextRow(&row));
+    i++;
+  }
+
+  EXPECT_EQ(i, 7);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+/// Feature: Vectors
+/// Description: test with all parameters in BuildFromFile and unknown_init in Lookup
+/// Expectation: return correct MSTensor which is equal to the expected
+TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) {
+  // Test with two parameters.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsUnknownInit.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
+  EXPECT_EQ(s, Status::OK());
+
+  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
+  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init);
+  EXPECT_NE(lookup, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({lookup}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, mindspore::MSTensor> row;
+  ASSERT_OK(iter->GetNextRow(&row));
+
+  uint64_t i = 0;
+  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
+                                              {-1, -1, -1, -1, -1, -1},
+                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
+                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
+                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
+                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
+                                              {-1, -1, -1, -1, -1, -1}};
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    MS_LOG(INFO) << ind.Shape();
+    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
+    TensorPtr de_expected_item;
+    dsize_t dim = 6;
+    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
+    mindspore::MSTensor ms_expected_item =
+      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
+    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
+
+    ASSERT_OK(iter->GetNextRow(&row));
+    i++;
+  }
+
+  EXPECT_EQ(i, 7);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+/// Feature: Vectors
+/// Description: test with all parameters, including path and max_vectors in BuildFromFile, and token,
+///     unknown_init and lower_case_backup in Lookup; some tokens contain upper-case letters
+/// Expectation: return correct MSTensor which is equal to the expected
+TEST_F(MindDataTestPipeline, TestVectorsAllParams) {
+  // Test with all parameters.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams.";
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
+  EXPECT_EQ(s, Status::OK());
+
+  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
+  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
+  EXPECT_NE(lookup, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({lookup}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, mindspore::MSTensor> row;
+  ASSERT_OK(iter->GetNextRow(&row));
+
+  uint64_t i = 0;
+  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
+                                              {-1, -1, -1, -1, -1, -1},
+                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
+                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
+                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
+                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
+                                              {-1, -1, -1, -1, -1, -1}};
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    MS_LOG(INFO) << ind.Shape();
+    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
+    TensorPtr de_expected_item;
+    dsize_t dim = 6;
+    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
+    mindspore::MSTensor ms_expected_item =
+      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
+    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
+
+    ASSERT_OK(iter->GetNextRow(&row));
+    i++;
+  }
+
+  EXPECT_EQ(i, 7);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+/// Feature: Vectors
+/// Description: test with a pre-trained vector set whose rows have different dimensions
+/// Expectation: throw correct error and message
+TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) {
+  // Tokens don't have the same number of vectors.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_dim_different.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
+  EXPECT_NE(s, Status::OK());
+}
+
+/// Feature: Vectors
+/// Description: test with a pre-trained vector set that has an info header
+/// Expectation: return correct MSTensor which is equal to the expected
+TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) {
+  // Test with a vector file that has an info header.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo.";
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_info.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
+  EXPECT_EQ(s, Status::OK());
+
+  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
+  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
+  EXPECT_NE(lookup, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({lookup}, {"text"});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, mindspore::MSTensor> row;
+  ASSERT_OK(iter->GetNextRow(&row));
+
+  uint64_t i = 0;
+  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
+                                              {-1, -1, -1, -1, -1, -1},
+                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
+                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
+                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
+                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
+                                              {-1, -1, -1, -1, -1, -1}};
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    MS_LOG(INFO) << ind.Shape();
+    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
+    TensorPtr de_expected_item;
+    dsize_t dim = 6;
+    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
+    mindspore::MSTensor ms_expected_item =
+      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
+    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
+
+    ASSERT_OK(iter->GetNextRow(&row));
+    i++;
+  }
+
+  EXPECT_EQ(i, 7);
+
+  // Manually terminate the pipeline
+  iter->Stop();
+}
+
+/// Feature: Vectors
+/// Description: test with the parameter max_vectors <= 0
+/// Expectation: throw correct error and message
+TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) {
+  // Test with max_vectors <= 0.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir, -1);
+  EXPECT_NE(s, Status::OK());
+}
+
+/// Feature: Vectors
+/// Description: test with an empty pre-trained vector file
+/// Expectation: throw correct error and message
+TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) {
+  // Read empty file.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
+  EXPECT_NE(s, Status::OK());
+}
+
+/// Feature: Vectors
+/// Description: test with a pre-trained vector file that does not exist
+/// Expectation: throw correct error and message
+TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) {
+  // Test with a file that does not exist.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
+  EXPECT_NE(s, Status::OK());
+}
+
+/// Feature: Vectors
+/// Description: test with a pre-trained vector set whose info header is not the first line
+/// Expectation: throw correct error and message
+TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
+  // Wrong info header.
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_wrong_info.txt";
+  std::shared_ptr<Vectors> vectors;
+  Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
+  EXPECT_NE(s, Status::OK());
+}
@@ -23,10 +23,12 @@
 #include "minddata/dataset/include/dataset/vision.h"
 #include "minddata/dataset/include/dataset/audio.h"
 #include "minddata/dataset/include/dataset/text.h"
+#include "minddata/dataset/text/vectors.h"
 #include "utils/log_adapter.h"
 
 using namespace mindspore::dataset;
 using mindspore::LogStream;
+using mindspore::dataset::Vectors;
 using mindspore::ExceptionType::NoExceptionType;
 using mindspore::MsLogLevel::INFO;
 
@ -1529,6 +1531,140 @@ TEST_F(MindDataTestExecute, TestFlangerWithWrongArg) {
  EXPECT_FALSE(s01.IsOk());
}

/// Feature: Vectors
/// Description: test basic usage of Vectors and ToVectors with default parameters
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestVectorsParam) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestVectorsParam.";
  std::shared_ptr<Tensor> de_tensor;
  Tensor::CreateScalar<std::string>("ok", &de_tensor);
  auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
  mindspore::MSTensor lookup_result;

  // Create expected output.
  std::shared_ptr<Tensor> de_expected;
  std::vector<float> expected = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
  dsize_t dim = 6;
  ASSERT_OK(Tensor::CreateFromVector(expected, TensorShape({dim}), &de_expected));
  auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  // Transform params.
  std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
  std::shared_ptr<Vectors> vectors01;
  Status s01 = Vectors::BuildFromFile(&vectors01, vectors_dir);
  EXPECT_EQ(s01, Status::OK());
  std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors01);
  auto transform01 = Execute({to_vectors01});
  Status status01 = transform01(token, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
  EXPECT_TRUE(status01.IsOk());

  std::shared_ptr<Vectors> vectors02;
  Status s02 = Vectors::BuildFromFile(&vectors02, vectors_dir, 100);
  EXPECT_EQ(s02, Status::OK());
  std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors02);
  auto transform02 = Execute({to_vectors02});
  Status status02 = transform02(token, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
  EXPECT_TRUE(status02.IsOk());

  std::shared_ptr<Vectors> vectors03;
  Status s03 = Vectors::BuildFromFile(&vectors03, vectors_dir, 3);
  EXPECT_EQ(s03, Status::OK());
  std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(vectors03);
  auto transform03 = Execute({to_vectors03});
  Status status03 = transform03(token, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
  EXPECT_TRUE(status03.IsOk());
}
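
All three builds above resolve "ok" to the same row: max_vectors only truncates the file from the top, and "ok" is the first entry in vectors.txt, so even a cap of 3 keeps it. A short Python sketch of the same property (illustrative, assuming the test data path):

import mindspore.dataset.text as text

path = "data/dataset/testVectors/vectors.txt"
full = text.ToVectors(text.Vectors.from_file(path))
capped = text.ToVectors(text.Vectors.from_file(path, max_vectors=3))
# "ok" is row 1 of the file, so it survives the cap of 3.
assert (full("ok") == capped("ok")).all()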

/// Feature: ToVectors
/// Description: test basic usage of ToVectors and Vectors with default parameters
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestToVectorsParam) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParam.";
  std::shared_ptr<Tensor> de_tensor01;
  Tensor::CreateScalar<std::string>("none", &de_tensor01);
  auto token01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor01));
  std::shared_ptr<Tensor> de_tensor02;
  Tensor::CreateScalar<std::string>("ok", &de_tensor02);
  auto token02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor02));
  std::shared_ptr<Tensor> de_tensor03;
  Tensor::CreateScalar<std::string>("OK", &de_tensor03);
  auto token03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor03));
  mindspore::MSTensor lookup_result;

  // Create expected output.
  dsize_t dim = 6;
  std::shared_ptr<Tensor> de_expected01;
  std::vector<float> expected01 = {0, 0, 0, 0, 0, 0};
  ASSERT_OK(Tensor::CreateFromVector(expected01, TensorShape({dim}), &de_expected01));
  auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01));
  std::shared_ptr<Tensor> de_expected02;
  std::vector<float> expected02 = {-1, -1, -1, -1, -1, -1};
  ASSERT_OK(Tensor::CreateFromVector(expected02, TensorShape({dim}), &de_expected02));
  auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02));
  std::shared_ptr<Tensor> de_expected03;
  std::vector<float> expected03 = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
  ASSERT_OK(Tensor::CreateFromVector(expected03, TensorShape({dim}), &de_expected03));
  auto ms_expected03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected03));

  // Transform params.
  std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
  std::shared_ptr<Vectors> vectors;
  Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors);
  auto transform01 = Execute({to_vectors01});
  Status status01 = transform01(token01, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected01);
  EXPECT_TRUE(status01.IsOk());
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors, unknown_init);
  auto transform02 = Execute({to_vectors02});
  Status status02 = transform02(token01, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected02);
  EXPECT_TRUE(status02.IsOk());
  std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(vectors, unknown_init);
  auto transform03 = Execute({to_vectors03});
  Status status03 = transform03(token02, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
  EXPECT_TRUE(status03.IsOk());
  std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
  auto transform04 = Execute({to_vectors04});
  Status status04 = transform04(token03, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
  EXPECT_TRUE(status04.IsOk());
}
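
Taken together, the four transforms above pin down the lookup fallbacks: a token missing from the table maps to an all-zero vector by default, to unk_init when one is supplied, and lower_case_backup=true retries a failed lookup with the lower-cased token. The same behavior from Python, as a sketch under the same test-data assumptions:

import mindspore.dataset.text as text

vectors = text.Vectors.from_file("data/dataset/testVectors/vectors.txt")
zeros = text.ToVectors(vectors)("none")                        # OOV -> all zeros
unk = text.ToVectors(vectors, unk_init=[-1.0] * 6)("none")     # OOV -> unk_init
ok = text.ToVectors(vectors, lower_case_backup=True)("OK")     # retried as "ok"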

/// Feature: ToVectors
/// Description: test invalid parameters of ToVectors
/// Expectation: throw exception correctly
TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParam) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParam.";
  std::shared_ptr<Tensor> de_tensor;
  Tensor::CreateScalar<std::string>("none", &de_tensor);
  auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
  mindspore::MSTensor lookup_result;

  // Transform params.
  std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
  std::shared_ptr<Vectors> vectors01;
  Status s = Vectors::BuildFromFile(&vectors01, vectors_dir);
  EXPECT_EQ(s, Status::OK());
  // unk_init whose length does not match the vector dimension.
  std::vector<float> unknown_init = {-1, -1, -1, -1};
  std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors01, unknown_init);
  auto transform01 = Execute({to_vectors01});
  Status status01 = transform01(token, &lookup_result);
  EXPECT_FALSE(status01.IsOk());
  // Null Vectors object.
  std::shared_ptr<Vectors> vectors02 = nullptr;
  std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors02);
  auto transform02 = Execute({to_vectors02});
  Status status02 = transform02(token, &lookup_result);
  EXPECT_FALSE(status02.IsOk());
}

// Feature: DBToAmplitude
// Description: test DBToAmplitude in eager mode
// Expectation: the data is processed successfully

@ -0,0 +1,6 @@
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

@ -0,0 +1,6 @@
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

@ -0,0 +1,7 @@
6 6
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
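
This variant prepends a word2vec-style info header, "<num_vectors> <dim>" ("6 6"), before the rows. The loader tolerates such a header only as the first line, which is why the next fixture, with "6 6" buried mid-file, must fail to build. A sketch of how a first line can be classified (an illustrative assumption about the parsing rule, not the shipped code):

def read_info_header(first_line):
    """Return (num_vectors, dim) if the line is a two-integer header, else None."""
    parts = first_line.split()
    if len(parts) == 2 and all(p.isdigit() for p in parts):
        return int(parts[0]), int(parts[1])
    return None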

@ -0,0 +1,7 @@
the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
6 6
of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

@ -0,0 +1,7 @@
ok
.
this
is
my
home
.

@ -0,0 +1,7 @@
ok
!
This
iS
my
HOME
.
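
This mixed-case word list only resolves against the lower-case tokens in vectors.txt when ToVectors is built with lower_case_backup=True; without it, "This", "iS", and "HOME" fall back to the default vector. For instance (a sketch, assuming the test data path):

import mindspore.dataset.text as text

vectors = text.Vectors.from_file("../data/dataset/testVectors/vectors.txt")
strict = text.ToVectors(vectors)
relaxed = text.ToVectors(vectors, lower_case_backup=True)
assert not (strict("HOME") != 0).any()   # miss: all-zero fallback
assert (relaxed("HOME") != 0).any()      # hit: looked up as "home"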

@ -0,0 +1,236 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import numpy as np
import pytest

from mindspore import log
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.text.transforms as T

DATASET_ROOT_PATH = "../data/dataset/testVectors/"


def test_vectors_all_tovectors_params_eager():
    """
    Feature: Vectors
    Description: test ToVectors in eager mode with all parameters, including `unk_init`
        and `lower_case_backup`
    Expectation: output is equal to the expected value
    """
    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=4)
    my_unk = [-1, -1, -1, -1, -1, -1]
    to_vectors = T.ToVectors(vectors, unk_init=my_unk, lower_case_backup=True)
    result1 = to_vectors("Ok")
    result2 = to_vectors("!")
    result3 = to_vectors("This")
    result4 = to_vectors("is")
    result5 = to_vectors("my")
    result6 = to_vectors("home")
    result7 = to_vectors("none")
    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
           [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
           [-1, -1, -1, -1, -1, -1],
           [-1, -1, -1, -1, -1, -1],
           [-1, -1, -1, -1, -1, -1]]
    res_array = np.array(res, dtype=np.float32)

    assert np.array_equal(result1, res_array[0])
    assert np.array_equal(result2, res_array[1])
    assert np.array_equal(result3, res_array[2])
    assert np.array_equal(result4, res_array[3])
    assert np.array_equal(result5, res_array[4])
    assert np.array_equal(result6, res_array[5])
    assert np.array_equal(result7, res_array[6])


def test_vectors_from_file():
    """
    Feature: Vectors
    Description: test with only the default parameters
    Expectation: output is equal to the expected value
    """
    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt")
    to_vectors = text.ToVectors(vectors)
    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
    data = data.map(operations=to_vectors, input_columns=["text"])
    ind = 0
    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
           [0, 0, 0, 0, 0, 0],
           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
           [0, 0, 0, 0, 0, 0]]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res_array = np.array(res[ind], dtype=np.float32)
        assert np.array_equal(res_array, d["text"]), ind
        ind += 1


def test_vectors_from_file_all_buildfromfile_params():
    """
    Feature: Vectors
    Description: test with all parameters of from_file, which include `path` and `max_vectors`
    Expectation: output is equal to the expected value
    """
    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=100)
    to_vectors = text.ToVectors(vectors)
    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
    data = data.map(operations=to_vectors, input_columns=["text"])
    ind = 0
    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
           [0, 0, 0, 0, 0, 0],
           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
           [0, 0, 0, 0, 0, 0]]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res_array = np.array(res[ind], dtype=np.float32)
        assert np.array_equal(res_array, d["text"]), ind
        ind += 1


def test_vectors_from_file_all_buildfromfile_params_eager():
    """
    Feature: Vectors
    Description: test with all parameters of from_file, which include `path` and `max_vectors`, in eager mode
    Expectation: output is equal to the expected value
    """
    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=4)
    to_vectors = T.ToVectors(vectors)
    result1 = to_vectors("ok")
    result2 = to_vectors("!")
    result3 = to_vectors("this")
    result4 = to_vectors("is")
    result5 = to_vectors("my")
    result6 = to_vectors("home")
    result7 = to_vectors("none")
    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
           [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
           [0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0]]
    res_array = np.array(res, dtype=np.float32)

    assert np.array_equal(result1, res_array[0])
    assert np.array_equal(result2, res_array[1])
    assert np.array_equal(result3, res_array[2])
    assert np.array_equal(result4, res_array[3])
    assert np.array_equal(result5, res_array[4])
    assert np.array_equal(result6, res_array[5])
    assert np.array_equal(result7, res_array[6])


def test_vectors_from_file_eager():
    """
    Feature: Vectors
    Description: test with only the default parameters in eager mode
    Expectation: output is equal to the expected value
    """
    vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt")
    to_vectors = T.ToVectors(vectors)
    result1 = to_vectors("ok")
    result2 = to_vectors("!")
    result3 = to_vectors("this")
    result4 = to_vectors("is")
    result5 = to_vectors("my")
    result6 = to_vectors("home")
    result7 = to_vectors("none")
    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
           [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
           [0, 0, 0, 0, 0, 0]]
    res_array = np.array(res, dtype=np.float32)

    assert np.array_equal(result1, res_array[0])
    assert np.array_equal(result2, res_array[1])
    assert np.array_equal(result3, res_array[2])
    assert np.array_equal(result4, res_array[3])
    assert np.array_equal(result5, res_array[4])
    assert np.array_equal(result6, res_array[5])
    assert np.array_equal(result7, res_array[6])


def test_vectors_invalid_input():
    """
    Feature: Vectors
    Description: test Vectors and ToVectors with invalid input
    Expectation: raise the correct error and message
    """
    def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None,
                           unk_init=None, lower_case_backup=False, token="ok"):
        log.info("Test Vectors with wrong input: {0}".format(test_name))
        with pytest.raises(error) as error_info:
            vectors = text.Vectors.from_file(file_path, max_vectors=max_vectors)
            to_vectors = T.ToVectors(vectors, unk_init=unk_init, lower_case_backup=lower_case_backup)
            to_vectors(token)
        assert error_msg in str(error_info.value)

    test_invalid_input("not all vectors have the same number of dimensions",
                       DATASET_ROOT_PATH + "vectors_dim_different.txt", error=RuntimeError,
                       error_msg="all vectors must have the same number of dimensions, but got dim 5 while expecting 6")
    test_invalid_input("the file is empty", DATASET_ROOT_PATH + "vectors_empty.txt",
                       error=RuntimeError, error_msg="invalid file, file is empty.")
    test_invalid_input("the length of `unk_init` differs from the vector dimension",
                       DATASET_ROOT_PATH + "vectors.txt",
                       error=RuntimeError, error_msg="Unexpected error. ToVectors: " +
                       "unk_init must be the same length as vectors, but got unk_init: 2 and vectors: 6",
                       unk_init=[-1, -1])
    test_invalid_input("the file does not exist", DATASET_ROOT_PATH + "not_exist.txt", error=RuntimeError,
                       error_msg="get real path failed")
    test_invalid_input("the info header is not on the first line",
                       DATASET_ROOT_PATH + "vectors_with_wrong_info.txt", error=RuntimeError,
                       error_msg="token with 1-dimensional vector.")
    test_invalid_input("max_vectors parameter must be greater than 0",
                       DATASET_ROOT_PATH + "vectors.txt", error=ValueError,
                       error_msg="Input max_vectors is not within the required interval", max_vectors=-1)
    test_invalid_input("invalid max_vectors parameter type as a float",
                       DATASET_ROOT_PATH + "vectors.txt", error=TypeError,
                       error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
                       " but got <class 'float'>.", max_vectors=1.0)
    test_invalid_input("invalid max_vectors parameter type as a string",
                       DATASET_ROOT_PATH + "vectors.txt", error=TypeError,
                       error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
                       " but got <class 'str'>.", max_vectors="1")
    test_invalid_input("invalid token parameter type as a float", DATASET_ROOT_PATH + "vectors.txt", error=RuntimeError,
                       error_msg="input tensor type should be string.", token=1.0)
    test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "vectors.txt",
                       error=TypeError, error_msg="Argument lower_case_backup with " +
                       "value True is not of type [<class 'bool'>],"
                       " but got <class 'str'>.", lower_case_backup="True")


if __name__ == '__main__':
    test_vectors_all_tovectors_params_eager()
    test_vectors_from_file()
    test_vectors_from_file_all_buildfromfile_params()
    test_vectors_from_file_all_buildfromfile_params_eager()
    test_vectors_from_file_eager()
    test_vectors_invalid_input()