[fix] [assistant] [I3ZSQM] add new data operator Vectors

This commit is contained in:
zhangyuqwer 2021-11-04 20:42:41 +08:00 committed by zy
parent acbaff17f7
commit 6a6e73ef82
28 changed files with 1391 additions and 20 deletions

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -18,13 +18,13 @@
#include "pybind11/stl_bind.h"
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
namespace dataset {
PYBIND_REGISTER(Vocab, 0, ([](const py::module *m) {
(void)py::class_<Vocab, std::shared_ptr<Vocab>>(*m, "Vocab")
.def(py::init<>())
@ -88,5 +88,14 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) {
.export_values();
}));
PYBIND_REGISTER(Vectors, 0, ([](const py::module *m) {
(void)py::class_<Vectors, std::shared_ptr<Vectors>>(*m, "Vectors")
.def(py::init<>())
.def_static("from_file", [](const std::string &path, int32_t max_vectors) {
std::shared_ptr<Vectors> vectors;
THROW_IF_ERROR(Vectors::BuildFromFile(&vectors, path, max_vectors));
return vectors;
});
}));
} // namespace dataset
} // namespace mindspore

View File

@ -19,6 +19,7 @@
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
namespace mindspore {
@ -208,6 +209,18 @@ PYBIND_REGISTER(ToNumberOperation, 1, ([](const py::module *m) {
}));
}));
PYBIND_REGISTER(
ToVectorsOperation, 1, ([](const py::module *m) {
(void)py::class_<text::ToVectorsOperation, TensorOperation, std::shared_ptr<text::ToVectorsOperation>>(
*m, "ToVectorsOperation")
.def(py::init(
[](const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup) {
auto to_vectors = std::make_shared<text::ToVectorsOperation>(vectors, unk_init, lower_case_backup);
THROW_IF_ERROR(to_vectors->ValidateParams());
return to_vectors;
}));
}));
PYBIND_REGISTER(TruncateSequencePairOperation, 1, ([](const py::module *m) {
(void)py::class_<text::TruncateSequencePairOperation, TensorOperation,
std::shared_ptr<text::TruncateSequencePairOperation>>(

View File

@ -358,6 +358,22 @@ ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>
std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
// ToVectors
struct ToVectors::Data {
Data(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup)
: vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
std::shared_ptr<Vectors> vectors_;
std::vector<float> unk_init_;
bool lower_case_backup_;
};
ToVectors::ToVectors(const std::shared_ptr<Vectors> &vectors, const std::vector<float> unk_init, bool lower_case_backup)
: data_(std::make_shared<Data>(vectors, unk_init, lower_case_backup)) {}
std::shared_ptr<TensorOperation> ToVectors::Parse() {
return std::make_shared<ToVectorsOperation>(data_->vectors_, data_->unk_init_, data_->lower_case_backup_);
}
// TruncateSequencePair
struct TruncateSequencePair::Data {
explicit Data(int32_t max_length) : max_length_(max_length) {}

View File

@ -31,13 +31,13 @@
namespace mindspore {
namespace dataset {
class Vocab;
class SentencePieceVocab;
class TensorOperation;
class Vectors;
class Vocab;
// Transform operations for text
namespace text {
#ifndef _WIN32
/// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
/// \note BasicTokenizer is not supported on the Windows platform yet.
@ -629,6 +629,30 @@ class MS_API ToNumber final : public TensorTransform {
std::shared_ptr<Data> data_;
};
/// \brief Look up a token in the given Vectors table and convert it into a vector.
class MS_API ToVectors final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] vectors A Vectors object.
/// \param[in] unk_init In case the token is out-of-vectors (OOV), the result will be initialized with `unk_init`
/// (default={}, which initializes with zero vectors).
/// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false).
explicit ToVectors(const std::shared_ptr<Vectors> &vectors, std::vector<float> unk_init = {},
bool lower_case_backup = false);
/// \brief Destructor
~ToVectors() = default;
protected:
/// \brief The function to convert a TensorTransform object into a TensorOperation object.
/// \return Shared pointer to the TensorOperation object.
std::shared_ptr<TensorOperation> Parse() override;
private:
struct Data;
std::shared_ptr<Data> data_;
};
/// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
class MS_API TruncateSequencePair final : public TensorTransform {
public:

View File

@ -133,6 +133,7 @@ constexpr char kNormalizeUTF8Op[] = "NormalizeUTF8Op";
constexpr char kRegexReplaceOp[] = "RegexReplaceOp";
constexpr char kRegexTokenizerOp[] = "RegexTokenizerOp";
constexpr char kToNumberOp[] = "ToNumberOp";
constexpr char kToVectorsOp[] = "ToVectorsOp";
constexpr char kTruncateSequencePairOp[] = "TruncateSequencePairOp";
constexpr char kUnicodeCharTokenizerOp[] = "UnicodeCharTokenizerOp";
constexpr char kUnicodeScriptTokenizerOp[] = "UnicodeScriptTokenizerOp";

View File

@ -4,6 +4,7 @@ add_subdirectory(kernels)
file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(text OBJECT
vectors.cc
vocab.cc
sentence_piece_vocab.cc
)

View File

@ -33,6 +33,7 @@
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
#include "minddata/dataset/text/kernels/sliding_window_op.h"
#include "minddata/dataset/text/kernels/to_number_op.h"
#include "minddata/dataset/text/kernels/to_vectors_op.h"
#include "minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
#include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h"
#include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h"
@ -420,6 +421,27 @@ Status ToNumberOperation::from_json(nlohmann::json op_params, std::shared_ptr<Te
return Status::OK();
}
// ToVectorsOperation
ToVectorsOperation::ToVectorsOperation(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
bool lower_case_backup)
: vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
ToVectorsOperation::~ToVectorsOperation() = default;
Status ToVectorsOperation::ValidateParams() {
if (vectors_ == nullptr) {
std::string err_msg = "ToVectors: vectors can't be nullptr.";
MS_LOG(ERROR) << err_msg;
LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
}
return Status::OK();
}
std::shared_ptr<TensorOp> ToVectorsOperation::Build() {
std::shared_ptr<ToVectorsOp> tensor_op = std::make_shared<ToVectorsOp>(vectors_, unk_init_, lower_case_backup_);
return tensor_op;
}
// TruncateSequencePairOperation
TruncateSequencePairOperation::TruncateSequencePairOperation(int32_t max_length) : max_length_(max_length) {}

View File

@ -27,6 +27,7 @@
namespace mindspore {
namespace dataset {
class Vectors;
class Vocab;
class SentencePieceVocab;
@ -45,6 +46,7 @@ constexpr char kRegexTokenizerOperation[] = "RegexTokenizer";
constexpr char kSentencepieceTokenizerOperation[] = "SentencepieceTokenizer";
constexpr char kSlidingWindowOperation[] = "SlidingWindow";
constexpr char kToNumberOperation[] = "ToNumber";
constexpr char kToVectorsOperation[] = "ToVectors";
constexpr char kTruncateSequencePairOperation[] = "TruncateSequencePair";
constexpr char kUnicodeCharTokenizerOperation[] = "UnicodeCharTokenizer";
constexpr char kUnicodeScriptTokenizerOperation[] = "UnicodeScriptTokenizer";
@ -294,6 +296,25 @@ class ToNumberOperation : public TensorOperation {
DataType data_type_;
};
class ToVectorsOperation : public TensorOperation {
public:
ToVectorsOperation(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
bool lower_case_backup);
~ToVectorsOperation();
std::shared_ptr<TensorOp> Build() override;
Status ValidateParams() override;
std::string Name() const override { return kToVectorsOperation; }
private:
std::shared_ptr<Vectors> vectors_;
std::vector<float> unk_init_;
bool lower_case_backup_;
};
class TruncateSequencePairOperation : public TensorOperation {
public:
explicit TruncateSequencePairOperation(int32_t max_length);

View File

@ -22,6 +22,7 @@ add_library(text-kernels OBJECT
wordpiece_tokenizer_op.cc
truncate_sequence_pair_op.cc
to_number_op.cc
to_vectors_op.cc
sentence_piece_tokenizer_op.cc
${ICU_DEPEND_FILES}
)

View File

@ -0,0 +1,58 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/text/kernels/to_vectors_op.h"
namespace mindspore {
namespace dataset {
ToVectorsOp::ToVectorsOp(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init,
bool lower_case_backup)
: vectors_(vectors), unk_init_(unk_init), lower_case_backup_(lower_case_backup) {}
Status ToVectorsOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
IO_CHECK(input, output);
CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
CHECK_FAIL_RETURN_UNEXPECTED(unk_init_.size() == 0 || unk_init_.size() == vectors_->Dim(),
"ToVectors: unk_init must be the same length as vectors, but got unk_init: " +
std::to_string(unk_init_.size()) + " and vectors: " + std::to_string(vectors_->Dim()));
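// Look up each token and append its embedding to one flat buffer; OOV tokens fall back to unk_init (or zero vectors).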
std::vector<float> vectors_vec;
int len = 0;
for (auto itr = input->begin<std::string_view>(); itr != input->end<std::string_view>(); ++itr) {
std::vector<float> vectors_value = vectors_->Lookup(std::string(*itr), unk_init_, lower_case_backup_);
CHECK_FAIL_RETURN_UNEXPECTED(!vectors_value.empty(), "ToVectors: invalid data, token: \"" + std::string(*itr) +
"\" doesn't exist in vectors and no unk_init is specified.");
vectors_vec.insert(vectors_vec.end(), vectors_value.begin(), vectors_value.end());
len++;
}
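// Shape the output: a single scalar token keeps a 1-D shape ([dim]); a sequence of tokens becomes 2-D ([len, dim]).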
CHECK_FAIL_RETURN_UNEXPECTED(len > 0, "ToVectors: invalid data, input tensor is empty.");
int dim = static_cast<int>(vectors_vec.size() / len);
if (vectors_vec.size() == dim) {
RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, output));
} else {
RETURN_IF_NOT_OK(Tensor::CreateFromVector(vectors_vec, TensorShape({len, dim}), output));
}
return Status::OK();
}
Status ToVectorsOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() == NumInput() && outputs.size() == NumOutput(),
"ToVectors: input and output size don't match.");
CHECK_FAIL_RETURN_UNEXPECTED(inputs[0] == DataType::DE_STRING, "ToVectors: input tensor type should be string.");
outputs[0] = DataType(DataType::DE_FLOAT32);
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,64 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
class ToVectorsOp : public TensorOp {
public:
/// \brief Constructor.
/// \param[in] vectors Vectors used to lookup tokens.
/// \param[in] unk_init Vector used to initialize OOV token.
/// \param[in] lower_case_backup Whether to look up the token in the lower case.
ToVectorsOp(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init, bool lower_case_backup);
/// \brief Destructor.
~ToVectorsOp() = default;
/// \brief Perform actual ToVectors on each tensor.
/// \param[in] input Input tensor.
/// \param[in] output Output tensor.
/// \return[out] Status code.
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
/// \param[in] inputs DataType of input tensor.
/// \param[in] outputs DataType of output tensor.
/// \return[out] Status code.
Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
/// \brief Get Op name.
std::string Name() const override { return kToVectorsOp; }
private:
std::shared_ptr<Vectors> vectors_;
std::vector<float> unk_init_;
bool lower_case_backup_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_TO_VECTORS_OP_H_

View File

@ -0,0 +1,145 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/text/vectors.h"
#include "utils/file_utils.h"
namespace mindspore {
namespace dataset {
Status Vectors::InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines,
int32_t *vector_dim) {
RETURN_UNEXPECTED_IF_NULL(num_lines);
RETURN_UNEXPECTED_IF_NULL(header_num_lines);
RETURN_UNEXPECTED_IF_NULL(vector_dim);
std::ifstream file_reader;
file_reader.open(path, std::ios::in);
CHECK_FAIL_RETURN_UNEXPECTED(file_reader.is_open(), "Vectors: invalid file, failed to open vector file: " + path);
*num_lines = 0, *header_num_lines = 0, *vector_dim = -1;
std::string line, row;
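// Scan the file once: detect an optional info header and infer the vector dimension from the first data line.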
while (std::getline(file_reader, line)) {
if (*vector_dim == -1) {
std::vector<std::string> vec;
std::istringstream line_reader(line);
while (std::getline(line_reader, row, ' ')) {
vec.push_back(row);
}
// The number of rows and dimensions can be obtained directly from the information header.
const int kInfoHeaderSize = 2;
if (vec.size() == kInfoHeaderSize) {
(*header_num_lines)++;
} else {
*vector_dim = vec.size() - 1;
(*num_lines)++;
}
} else {
(*num_lines)++;
}
}
CHECK_FAIL_RETURN_UNEXPECTED(*num_lines > 0, "Vectors: invalid file, file is empty.");
if (max_vectors > 0) {
*num_lines = std::min(max_vectors, *num_lines); // Determine the true rows.
}
return Status::OK();
}
Status Vectors::Load(const std::string &path, int32_t max_vectors,
std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim) {
RETURN_UNEXPECTED_IF_NULL(map);
RETURN_UNEXPECTED_IF_NULL(vector_dim);
auto realpath = FileUtils::GetRealPath(common::SafeCStr(path));
CHECK_FAIL_RETURN_UNEXPECTED(realpath.has_value(), "Vectors: get real path failed, path: " + path);
auto file_path = realpath.value();
CHECK_FAIL_RETURN_UNEXPECTED(max_vectors >= 0,
"Vectors: max_vectors must be non negative, but got: " + std::to_string(max_vectors));
int num_lines = 0, header_num_lines = 0;
RETURN_IF_NOT_OK(InferShape(file_path, max_vectors, &num_lines, &header_num_lines, vector_dim));
std::fstream file_reader;
file_reader.open(file_path, std::ios::in);
CHECK_FAIL_RETURN_UNEXPECTED(file_reader.is_open(),
"Vectors: invalid file, failed to open vector file: " + file_path);
while (header_num_lines > 0) {
file_reader.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
header_num_lines--;
}
std::string line, token, vector_value;
for (auto i = 0; i < num_lines; ++i) {
std::getline(file_reader, line);
std::istringstream line_reader(line);
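// The first space-delimited field is the token; the remaining fields are its vector components.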
std::getline(line_reader, token, ' ');
std::vector<float> vector_values;
int dim = 0;
while (line_reader >> vector_value) {
dim++;
vector_values.push_back(atof(vector_value.c_str()));
}
CHECK_FAIL_RETURN_UNEXPECTED(dim > 1, "Vectors: invalid file, the token must map to a vector with more than one dimension.");
CHECK_FAIL_RETURN_UNEXPECTED(dim == *vector_dim,
"Vectors: all vectors must have the same number of dimensions, but got dim " +
std::to_string(dim) + " while expecting " + std::to_string(*vector_dim));
auto token_index = map->find(token);
if (token_index == map->end()) {
(*map)[token] = vector_values;
}
}
return Status::OK();
}
// Note: `map` is a const reference, so std::move on it would silently copy; initialize the members directly.
Vectors::Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : map_(map), dim_(dim) {}
Status Vectors::BuildFromFile(std::shared_ptr<Vectors> *vectors, const std::string &path, int32_t max_vectors) {
std::unordered_map<std::string, std::vector<float>> map;
int vector_dim = -1;
RETURN_IF_NOT_OK(Load(path, max_vectors, &map, &vector_dim));
*vectors = std::make_shared<Vectors>(std::move(map), vector_dim);
return Status::OK();
}
std::vector<float> Vectors::Lookup(const std::string &token, const std::vector<float> &unk_init,
bool lower_case_backup) {
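// Fallback for OOV tokens: a zero vector by default, or unk_init when its size matches the vector dimension.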
std::vector<float> init_vec(dim_, 0);
if (!unk_init.empty()) {
if (unk_init.size() != dim_) {
MS_LOG(WARNING) << "Vectors: size of unk_init is not the same as vectors, will initialize with zero vectors.";
} else {
init_vec = unk_init;
}
}
std::string lower_token = token;
if (lower_case_backup) {
std::transform(lower_token.begin(), lower_token.end(), lower_token.begin(), ::tolower);
}
auto str_index = map_.find(lower_token);
if (str_index == map_.end()) {
return init_vec;
} else {
return str_index->second;
}
}
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,89 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_
#include <algorithm>
#include <fstream>
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/include/dataset/iterator.h"
namespace mindspore {
namespace dataset {
/// \brief Pre-trained word vectors.
class Vectors {
public:
/// Constructor.
Vectors() = default;
/// Constructor.
/// \param[in] map A map between string and vector.
/// \param[in] dim Dimension of the vectors.
Vectors(const std::unordered_map<std::string, std::vector<float>> &map, int dim);
/// Destructor.
virtual ~Vectors() = default;
/// \brief Build a Vectors object by reading a pre-trained word vector file.
/// \param[out] vectors Vectors object which contains the pre-train vectors.
/// \param[in] path Path to the pre-trained word vector file.
/// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
static Status BuildFromFile(std::shared_ptr<Vectors> *vectors, const std::string &path, int32_t max_vectors = 0);
/// \brief Look up the embedding vector of a token.
/// \param[in] token A token to be looked up.
/// \param[in] unk_init In case the token is out-of-vectors (OOV), the result will be initialized with `unk_init`
/// (default={}, which initializes with zero vectors).
/// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false).
/// \return The vector of the input token.
virtual std::vector<float> Lookup(const std::string &token, const std::vector<float> &unk_init = {},
bool lower_case_backup = false);
/// \brief Getter of dimension.
const int &Dim() const { return dim_; }
protected:
/// \brief Infer the shape of the pre-trained word vector file.
/// \param[in] path Path to the pre-trained word vector file.
/// \param[in] max_vectors Maximum number of pre-trained word vectors to be read.
/// \param[out] num_lines The number of lines of the file.
/// \param[out] header_num_lines The number of lines of file header.
/// \param[out] vector_dim The dimension of the vectors in the file.
static Status InferShape(const std::string &path, int32_t max_vectors, int32_t *num_lines, int32_t *header_num_lines,
int32_t *vector_dim);
/// \brief Load the token-to-vector map by reading a pre-trained word vector file.
/// \param[in] path Path to the pre-trained word vector file.
/// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded; must be non-negative.
/// \param[out] map The map between words and vectors.
/// \param[out] vector_dim The dimension of the vectors in the file.
static Status Load(const std::string &path, int32_t max_vectors,
std::unordered_map<std::string, std::vector<float>> *map, int *vector_dim);
int dim_;
std::unordered_map<std::string, std::vector<float>> map_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_VECTORS_H_

View File

@ -26,15 +26,15 @@ Common imported modules in corresponding API examples are as follows:
"""
import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer
TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
SPieceTokenizerOutType, SPieceTokenizerLoadType
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors
__all__ = [
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
"to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
"PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType",
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm",
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors"
]
if platform.system().lower() != 'windows':

View File

@ -48,7 +48,7 @@ import mindspore._c_dataengine as cde
from mindspore.common import dtype as mstype
from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
from .validators import check_lookup, check_jieba_add_dict, \
from .validators import check_lookup, check_jieba_add_dict, check_to_vectors, \
check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
check_wordpiece_tokenizer, check_regex_replace, check_regex_tokenizer, check_basic_tokenizer, check_ngram, \
check_pair_truncate, check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow, \
@ -345,6 +345,7 @@ class SentencePieceTokenizer(TextTensorOperation):
>>> tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
>>> text_file_dataset = text_file_dataset.map(operations=tokenizer)
"""
@check_sentence_piece_tokenizer
def __init__(self, mode, out_type):
self.mode = mode
@ -421,6 +422,36 @@ class ToNumber(TextTensorOperation):
return cde.ToNumberOperation(self.data_type)
class ToVectors(TextTensorOperation):
"""
Look up a token in the input vector table and convert it into a vector.
Args:
vectors (Vectors): A vectors object.
unk_init (sequence, optional): Sequence used to initialize out-of-vectors (OOV) tokens
(default=None, which initializes with zero vectors).
lower_case_backup (bool, optional): Whether to look up the token in the lower case. If False, each token in the
original case will be looked up. If True, each token in the original case will be looked up first; if it is not
found in the keys of the property stoi, the token in the lower case will be looked up (default=False).
Examples:
>>> # Load vectors from file
>>> vectors = text.Vectors.from_file("/path/to/vectors/file")
>>> # Use ToVectors operator to map tokens to vectors
>>> to_vectors = text.ToVectors(vectors)
>>> text_file_dataset = text_file_dataset.map(operations=[to_vectors])
"""
@check_to_vectors
def __init__(self, vectors, unk_init=None, lower_case_backup=False):
self.vectors = vectors
self.unk_init = unk_init if unk_init is not None else []
self.lower_case_backup = lower_case_backup
def parse(self):
return cde.ToVectorsOperation(self.vectors, self.unk_init, self.lower_case_backup)
class TruncateSequencePair(TextTensorOperation):
"""
Truncate a pair of rank-1 tensors such that the total length is less than max_length.

View File

@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -16,16 +16,18 @@ The module text.utils provides some general methods for NLP text processing.
For example, you can use Vocab to build a dictionary,
use to_bytes and to_str to encode and decode strings into a specified format.
"""
from enum import IntEnum
import numpy as np
import mindspore._c_dataengine as cde
import mindspore._c_dataengine as cde
from .validators import check_from_file, check_from_list, check_from_dict, check_from_dataset, \
check_from_dataset_sentencepiece, check_from_file_sentencepiece, check_save_model
check_from_dataset_sentencepiece, check_from_file_sentencepiece, check_save_model, \
check_from_file_vectors
__all__ = [
"Vocab", "SentencePieceVocab", "to_str", "to_bytes"
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors"
]
@ -383,3 +385,29 @@ class SPieceTokenizerLoadType(IntEnum):
"""
FILE = 0
MODEL = 1
class Vectors(cde.Vectors):
"""
Vectors object that is used to map tokens into vectors.
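Examples:
>>> vectors = text.Vectors.from_file("/path/to/vectors/file")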
"""
@classmethod
@check_from_file_vectors
def from_file(cls, file_path, max_vectors=None):
"""
Build a Vectors object from a pre-trained word vector file.
Args:
file_path (str): Path of the file that contains the vectors.
max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
situations where the entire set doesn't fit in memory, or is not needed for another reason,
passing max_vectors can limit the size of the loaded set (default=None, no limit).
Examples:
>>> vector = text.Vectors.from_file("/path/to/vectors/file", max_vectors=None)
"""
max_vectors = max_vectors if max_vectors is not None else 0
return super().from_file(file_path, max_vectors)

View File

@ -15,15 +15,14 @@
"""
validators for text ops
"""
from functools import wraps
import mindspore.common.dtype as mstype
import mindspore._c_dataengine as cde
import mindspore.common.dtype as mstype
from mindspore._c_expression import typing
from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, \
INT32_MAX, check_value, check_positive, check_pos_int32
INT32_MAX, check_value, check_positive, check_pos_int32, check_filename, check_non_negative_int32
def check_unique_list_of_words(words, arg_name):
@ -532,3 +531,39 @@ def check_sentence_piece_tokenizer(method):
return method(self, *args, **kwargs)
return new_method
def check_from_file_vectors(method):
"""A wrapper that wraps a parameter checker to from_file of class Vectors."""
@wraps(method)
def new_method(self, *args, **kwargs):
[file_path, max_vectors], _ = parse_user_args(method, *args, **kwargs)
type_check(file_path, (str,), "file_path")
check_filename(file_path)
if max_vectors is not None:
type_check(max_vectors, (int,), "max_vectors")
check_non_negative_int32(max_vectors, "max_vectors")
return method(self, *args, **kwargs)
return new_method
def check_to_vectors(method):
"""A wrapper that wraps a parameter checker to ToVectors."""
@wraps(method)
def new_method(self, *args, **kwargs):
[vectors, unk_init, lower_case_backup], _ = parse_user_args(method, *args, **kwargs)
type_check(vectors, (cde.Vectors,), "vectors")
if unk_init is not None:
type_check(unk_init, (list, tuple), "unk_init")
for i, value in enumerate(unk_init):
type_check(value, (int, float), "unk_init[{0}]".format(i))
type_check(lower_case_backup, (bool,), "lower_case_backup")
return method(self, *args, **kwargs)
return new_method

View File

@ -51,6 +51,7 @@ SET(DE_UT_SRCS
c_api_samplers_test.cc
c_api_text_sentence_piece_vocab_test.cc
c_api_text_vocab_test.cc
c_api_text_test.cc
c_api_transforms_test.cc
c_api_vision_a_to_q_test.cc
c_api_vision_affine_test.cc

View File

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include <memory>
#include <vector>
#include <string>
#include <vector>
#include "common/common.h"
#include "include/api/status.h"
@ -23,12 +23,14 @@
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
using namespace mindspore::dataset;
using mindspore::Status;
using mindspore::dataset::ShuffleMode;
using mindspore::dataset::Tensor;
using mindspore::dataset::Vectors;
using mindspore::dataset::Vocab;
class MindDataTestPipeline : public UT::DatasetOpTesting {
@ -1596,7 +1598,8 @@ TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
EXPECT_NE(ds, nullptr);
// Create ToNumber operation on ds
std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
std::shared_ptr<TensorTransform> to_number =
std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
EXPECT_NE(to_number, nullptr);
// Create a Map operation on ds
@ -3543,3 +3546,400 @@ TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test functions BuildFromFile and Lookup with default parameters
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsDefaultParam) {
// Test with default parameter.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDefaultParam.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsAllBuildfromfileParams) {
// Test with two parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllBuildfromfileParams.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) {
// Test with two parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsUnknownInit.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
/// `unknown_init` and `lower_case_backup` in function Lookup, where some tokens contain upper-case letters
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsAllParams) {
// Test with all parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with a pre-trained vector file whose rows have different dimensions
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) {
// Tokens' vectors don't all have the same dimension.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_dim_different.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
EXPECT_NE(s, Status::OK());
}
/// Feature: Vectors
/// Description: test with a pre-trained vector file that has an info header
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) {
// Test with a vector file that has an info header.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_info.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: Vectors
/// Description: test with the parameter max_vectors that is less than 0
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) {
// Test with max_vectors < 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir, -1);
EXPECT_NE(s, Status::OK());
}
/// Feature: Vectors
/// Description: test with an empty pre-trained vector file
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) {
// Read empty file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_NE(s, Status::OK());
}
/// Feature: Vectors
/// Description: test with a pre-trained vector file that does not exist
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) {
// Test with a file that does not exist.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_NE(s, Status::OK());
}
/// Feature: Vectors
/// Description: test with a pre-trained vector file whose info header is not on the first line
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
// Wrong position of the info header.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_wrong_info.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_NE(s, Status::OK());
}

View File

@ -23,10 +23,12 @@
#include "minddata/dataset/include/dataset/vision.h"
#include "minddata/dataset/include/dataset/audio.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/text/vectors.h"
#include "utils/log_adapter.h"
using namespace mindspore::dataset;
using mindspore::LogStream;
using mindspore::dataset::Vectors;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::MsLogLevel::INFO;
@ -1529,6 +1531,140 @@ TEST_F(MindDataTestExecute, TestFlangerWithWrongArg) {
EXPECT_FALSE(s01.IsOk());
}
/// Feature: Vectors
/// Description: test basic usage of Vectors and the ToVectors with default parameter
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestVectorsParam) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestVectorsParam.";
std::shared_ptr<Tensor> de_tensor;
Tensor::CreateScalar<std::string>("ok", &de_tensor);
auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
mindspore::MSTensor lookup_result;
// Create expected output.
std::shared_ptr<Tensor> de_expected;
std::vector<float> expected = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected, TensorShape({dim}), &de_expected));
auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
// Transform params.
std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors01;
Status s01 = Vectors::BuildFromFile(&vectors01, vectors_dir);
EXPECT_EQ(s01, Status::OK());
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors01);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
EXPECT_TRUE(status01.IsOk());
std::shared_ptr<Vectors> vectors02;
Status s02 = Vectors::BuildFromFile(&vectors02, vectors_dir, 100);
EXPECT_EQ(s02, Status::OK());
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors02);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
EXPECT_TRUE(status02.IsOk());
std::shared_ptr<Vectors> vectors03;
Status s03 = Vectors::BuildFromFile(&vectors03, vectors_dir, 3);
EXPECT_EQ(s03, Status::OK());
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(vectors03);
auto transform03 = Execute({to_vectors03});
Status status03 = transform03(token, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
EXPECT_TRUE(status03.IsOk());
}
/// Feature: ToVectors
/// Description: test basic usage of ToVectors and the Vectors with default parameter
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestToVectorsParam) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParam.";
std::shared_ptr<Tensor> de_tensor01;
Tensor::CreateScalar<std::string>("none", &de_tensor01);
auto token01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor01));
std::shared_ptr<Tensor> de_tensor02;
Tensor::CreateScalar<std::string>("ok", &de_tensor02);
auto token02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor02));
std::shared_ptr<Tensor> de_tensor03;
Tensor::CreateScalar<std::string>("OK", &de_tensor03);
auto token03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor03));
mindspore::MSTensor lookup_result;
// Create expected output.
dsize_t dim = 6;
std::shared_ptr<Tensor> de_expected01;
std::vector<float> expected01 = {0, 0, 0, 0, 0, 0};
ASSERT_OK(Tensor::CreateFromVector(expected01, TensorShape({dim}), &de_expected01));
auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01));
std::shared_ptr<Tensor> de_expected02;
std::vector<float> expected02 = {-1, -1, -1, -1, -1, -1};
ASSERT_OK(Tensor::CreateFromVector(expected02, TensorShape({dim}), &de_expected02));
auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02));
std::shared_ptr<Tensor> de_expected03;
std::vector<float> expected03 = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
ASSERT_OK(Tensor::CreateFromVector(expected03, TensorShape({dim}), &de_expected03));
auto ms_expected03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected03));
// Transform params.
std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors;
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token01, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected01);
EXPECT_TRUE(status01.IsOk());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors, unknown_init);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token01, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected02);
EXPECT_TRUE(status02.IsOk());
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(vectors, unknown_init);
auto transform03 = Execute({to_vectors03});
Status status03 = transform03(token02, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
EXPECT_TRUE(status03.IsOk());
std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
auto transform04 = Execute({to_vectors04});
Status status04 = transform04(token03, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
EXPECT_TRUE(status04.IsOk());
}
/// Feature: ToVectors
/// Description: test invalid parameter of ToVectors
/// Expectation: throw exception correctly
TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParam) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParam.";
std::shared_ptr<Tensor> de_tensor;
Tensor::CreateScalar<std::string>("none", &de_tensor);
auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
mindspore::MSTensor lookup_result;
// Transform params.
std::string vectors_dir = "data/dataset/testVectors/vectors.txt";
std::shared_ptr<Vectors> vectors01;
Status s = Vectors::BuildFromFile(&vectors01, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1};
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(vectors01, unknown_init);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token, &lookup_result);
EXPECT_FALSE(status01.IsOk());
std::shared_ptr<Vectors> vectors02 = nullptr;
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(vectors02);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token, &lookup_result);
EXPECT_FALSE(status02.IsOk());
}
// Feature: DBToAmplitude
// Description: test DBToAmplitude in eager mode
// Expectation: the data is processed successfully

View File

@ -0,0 +1,6 @@
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,6 @@
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
6 6
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
6 6
of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
ok
.
this
is
my
home
.

View File

@ -0,0 +1,7 @@
ok
!
This
iS
my
HOME
.

View File

@ -0,0 +1,236 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pytest
from mindspore import log
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.text.transforms as T
DATASET_ROOT_PATH = "../data/dataset/testVectors/"
def test_vectors_all_tovectors_params_eager():
"""
Feature: Vectors
Description: test with all parameters which include `unk_init`
and `lower_case_backup` in function ToVectors in eager mode
Expectation: output is equal to the expected value
"""
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=4)
myUnk = [-1, -1, -1, -1, -1, -1]
to_vectors = T.ToVectors(vectors, unk_init=myUnk, lower_case_backup=True)
result1 = to_vectors("Ok")
result2 = to_vectors("!")
result3 = to_vectors("This")
result4 = to_vectors("is")
result5 = to_vectors("my")
result6 = to_vectors("home")
result7 = to_vectors("none")
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[-1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1]]
res_array = np.array(res, dtype=np.float32)
assert np.array_equal(result1, res_array[0])
assert np.array_equal(result2, res_array[1])
assert np.array_equal(result3, res_array[2])
assert np.array_equal(result4, res_array[3])
assert np.array_equal(result5, res_array[4])
assert np.array_equal(result6, res_array[5])
assert np.array_equal(result7, res_array[6])


def test_vectors_from_file():
"""
Feature: Vectors
Description: test with only the default parameters
Expectation: output is equal to the expected value
"""
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt")
to_vectors = text.ToVectors(vectors)
data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
data = data.map(operations=to_vectors, input_columns=["text"])
ind = 0
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0, 0, 0, 0, 0, 0],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind
ind += 1


def test_vectors_from_file_all_buildfromfile_params():
"""
Feature: Vectors
Description: test with all parameters, which include `path` and `max_vectors`, in function from_file
Expectation: output is equal to the expected value
"""
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=100)
to_vectors = text.ToVectors(vectors)
data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
data = data.map(operations=to_vectors, input_columns=["text"])
ind = 0
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0, 0, 0, 0, 0, 0],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind
ind += 1


def test_vectors_from_file_all_buildfromfile_params_eager():
"""
Feature: Vectors
Description: test with all parameters, which include `path` and `max_vectors`, in function from_file in eager mode
Expectation: output is equal to the expected value
"""
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt", max_vectors=4)
to_vectors = T.ToVectors(vectors)
result1 = to_vectors("ok")
result2 = to_vectors("!")
result3 = to_vectors("this")
result4 = to_vectors("is")
result5 = to_vectors("my")
result6 = to_vectors("home")
result7 = to_vectors("none")
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]]
res_array = np.array(res, dtype=np.float32)
assert np.array_equal(result1, res_array[0])
assert np.array_equal(result2, res_array[1])
assert np.array_equal(result3, res_array[2])
assert np.array_equal(result4, res_array[3])
assert np.array_equal(result5, res_array[4])
assert np.array_equal(result6, res_array[5])
assert np.array_equal(result7, res_array[6])


def test_vectors_from_file_eager():
"""
Feature: Vectors
Description: test with only the default parameters in eager mode
Expectation: output is equal to the expected value
"""
vectors = text.Vectors.from_file(DATASET_ROOT_PATH + "vectors.txt")
to_vectors = T.ToVectors(vectors)
result1 = to_vectors("ok")
result2 = to_vectors("!")
result3 = to_vectors("this")
result4 = to_vectors("is")
result5 = to_vectors("my")
result6 = to_vectors("home")
result7 = to_vectors("none")
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
res_array = np.array(res, dtype=np.float32)
assert np.array_equal(result1, res_array[0])
assert np.array_equal(result2, res_array[1])
assert np.array_equal(result3, res_array[2])
assert np.array_equal(result4, res_array[3])
assert np.array_equal(result5, res_array[4])
assert np.array_equal(result6, res_array[5])
assert np.array_equal(result7, res_array[6])


def test_vectors_invalid_input():
"""
Feature: Vectors
Description: test Vectors and ToVectors with invalid input parameters
Expectation: raise the expected error type with the expected message
"""
def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None,
unk_init=None, lower_case_backup=False, token="ok"):
log.info("Test Vectors with wrong input: {0}".format(test_name))
with pytest.raises(error) as error_info:
vectors = text.Vectors.from_file(file_path, max_vectors=max_vectors)
to_vectors = T.ToVectors(vectors, unk_init=unk_init, lower_case_backup=lower_case_backup)
to_vectors(token)
assert error_msg in str(error_info.value)
test_invalid_input("Not all vectors have the same number of dimensions",
DATASET_ROOT_PATH + "vectors_dim_different.txt", error=RuntimeError,
error_msg="all vectors must have the same number of dimensions, but got dim 5 while expecting 6")
test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "vectors_empty.txt",
error=RuntimeError, error_msg="invalid file, file is empty.")
test_invalid_input("the count of `unknown_init`'s element is different with word vector.",
DATASET_ROOT_PATH + "vectors.txt",
error=RuntimeError, error_msg="Unexpected error. ToVectors: " +
"unk_init must be the same length as vectors, but got unk_init: 2 and vectors: 6",
unk_init=[-1, -1])
test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.txt", error=RuntimeError,
error_msg="get real path failed")
test_invalid_input("The token is 1-dimensional",
DATASET_ROOT_PATH + "vectors_with_wrong_info.txt", error=RuntimeError,
error_msg="token with 1-dimensional vector.")
test_invalid_input("max_vectors parameter must be greater than 0",
DATASET_ROOT_PATH + "vectors.txt", error=ValueError,
error_msg="Input max_vectors is not within the required interval", max_vectors=-1)
test_invalid_input("invalid max_vectors parameter type as a float",
DATASET_ROOT_PATH + "vectors.txt", error=TypeError,
error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
" but got <class 'float'>.", max_vectors=1.0)
test_invalid_input("invalid max_vectors parameter type as a string",
DATASET_ROOT_PATH + "vectors.txt", error=TypeError,
error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
" but got <class 'str'>.", max_vectors="1")
test_invalid_input("invalid token parameter type as a float", DATASET_ROOT_PATH + "vectors.txt", error=RuntimeError,
error_msg="input tensor type should be string.", token=1.0)
test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "vectors.txt",
error=TypeError, error_msg="Argument lower_case_backup with " +
"value True is not of type [<class 'bool'>],"
" but got <class 'str'>.", lower_case_backup="True")
test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "vectors.txt",
error=TypeError, error_msg="Argument lower_case_backup with " +
"value True is not of type [<class 'bool'>],"
" but got <class 'str'>.", lower_case_backup="True")


if __name__ == '__main__':
test_vectors_all_tovectors_params_eager()
test_vectors_from_file()
test_vectors_from_file_all_buildfromfile_params()
test_vectors_from_file_all_buildfromfile_params_eager()
test_vectors_from_file_eager()
test_vectors_invalid_input()