!22712 [assistant][ops] Add FastText

Merge pull request !22712 from 无言/FastText
i-robot 2021-12-01 06:50:59 +00:00 committed by Gitee
commit 0c6505db26
18 changed files with 922 additions and 5 deletions

View File

@ -19,6 +19,7 @@
#include "minddata/dataset/api/python/pybind_register.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/sentence_piece_vocab.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
@ -88,6 +89,16 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) {
.export_values();
}));
PYBIND_REGISTER(FastText, 1, ([](const py::module *m) {
(void)py::class_<FastText, Vectors, std::shared_ptr<FastText>>(*m, "FastText")
.def(py::init<>())
.def_static("from_file", [](const std::string &path, int32_t max_vectors) {
std::shared_ptr<FastText> fast_text;
THROW_IF_ERROR(FastText::BuildFromFile(&fast_text, path, max_vectors));
return fast_text;
});
}));
PYBIND_REGISTER(Vectors, 0, ([](const py::module *m) {
(void)py::class_<Vectors, std::shared_ptr<Vectors>>(*m, "Vectors")
.def(py::init<>())
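
For context, a minimal sketch of what the registration above exposes (illustrative, not part of the change set): FastText::BuildFromFile surfaces in Python as a static from_file method on the bound FastText class, taking a file path and a max_vectors count, where 0 means "no limit". Users normally go through the mindspore.dataset.text.FastText wrapper added later in this commit; the raw binding call, assuming the usual mindspore._c_dataengine binding module and a placeholder path, looks like:

import mindspore._c_dataengine as cde  # low-level dataset bindings (assumed module path)

# max_vectors=0 mirrors the C++ default of "no limit"; the path is a placeholder.
fast_text = cde.FastText.from_file("/path/to/fast_text.vec", 0)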

View File

@ -630,7 +630,7 @@ class MS_API ToNumber final : public TensorTransform {
};
/// \brief Look up a token into a vector according to the input Vectors table.
class ToVectors final : public TensorTransform {
class MS_API ToVectors final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] vectors A Vectors object.

View File

@ -4,9 +4,10 @@ add_subdirectory(kernels)
file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(text OBJECT
fast_text.cc
sentence_piece_vocab.cc
vectors.cc
vocab.cc
sentence_piece_vocab.cc
)
add_dependencies(text text-kernels)

View File

@ -0,0 +1,50 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/text/fast_text.h"
#include "utils/file_utils.h"
namespace mindspore {
namespace dataset {
FastText::FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : Vectors(map, dim) {}
Status CheckFastText(const std::string &file_path) {
Path path = Path(file_path);
if (path.Exists() && !path.IsDirectory()) {
std::string basename = path.Basename();
size_t dot = basename.rfind('.');
std::string suffix = basename.substr(dot + 1);
if (suffix != "vec") {
RETURN_STATUS_UNEXPECTED("FastText: invalid file, can not find file '*.vec', but got: " + file_path);
}
return Status::OK();
} else {
RETURN_STATUS_UNEXPECTED("FastText: invalid file, failed to open FastText file.");
}
}
Status FastText::BuildFromFile(std::shared_ptr<FastText> *fast_text, const std::string &path, int32_t max_vectors) {
RETURN_UNEXPECTED_IF_NULL(fast_text);
RETURN_IF_NOT_OK(CheckFastText(path));
std::unordered_map<std::string, std::vector<float>> map;
int vector_dim = -1;
RETURN_IF_NOT_OK(Load(path, max_vectors, &map, &vector_dim));
*fast_text = std::make_shared<FastText>(std::move(map), vector_dim);
return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,55 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/include/dataset/iterator.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/util/path.h"
namespace mindspore {
namespace dataset {
/// \brief Pre-trained word vectors.
class FastText : public Vectors {
public:
/// Constructor.
FastText() = default;
/// Constructor.
/// \param[in] map A map between string and vector.
/// \param[in] dim Dimension of the vectors.
FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim);
/// Destructor.
~FastText() = default;
/// \brief Build a FastText object by reading a pre-trained vector file.
/// \param[out] fast_text FastText object which contains the pre-trained vectors.
/// \param[in] path Path to the pre-trained word vector file. The file suffix must be `*.vec`.
/// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
static Status BuildFromFile(std::shared_ptr<FastText> *fast_text, const std::string &path, int32_t max_vectors = 0);
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_
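
For orientation, a usage sketch of the API declared above (illustrative, not part of the change set): on the Python side BuildFromFile surfaces as FastText.from_file, and the resulting object drives the ToVectors transform. A minimal pipeline example mirroring the tests in this commit, with placeholder paths:

import mindspore.dataset as ds
import mindspore.dataset.text as text

# Build the lookup table from a pre-trained *.vec file (path is a placeholder).
fast_text = text.FastText.from_file("/path/to/fast_text.vec", max_vectors=100)

# Map each token in the "text" column to its vector; tokens missing from the
# table fall back to an all-zero vector by default.
to_vectors = text.ToVectors(fast_text)
data = ds.TextFileDataset("/path/to/words.txt", shuffle=False)
data = data.map(operations=to_vectors, input_columns=["text"])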

View File

@ -30,6 +30,8 @@ namespace dataset {
class Vectors;
class Vocab;
class SentencePieceVocab;
class Vectors;
class Vocab;
// Transform operations for text
namespace text {

View File

@ -28,13 +28,13 @@ import platform
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors, FastText
__all__ = [
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
"to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
"PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType",
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors"
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors", "FastText"
]
if platform.system().lower() != 'windows':

View File

@ -27,7 +27,7 @@ from .validators import check_from_file, check_from_list, check_from_dict, check
check_from_file_vectors
__all__ = [
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors"
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors", "FastText"
]
@ -411,3 +411,30 @@ class Vectors(cde.Vectors):
max_vectors = max_vectors if max_vectors is not None else 0
return super().from_file(file_path, max_vectors)
class FastText(cde.FastText):
"""
FastText object that is used to map tokens into vectors.
"""
@classmethod
@check_from_file_vectors
def from_file(cls, file_path, max_vectors=None):
"""
Build a FastText vector from a file.
Args:
file_path (str): Path of the file that contains the vectors. The suffix of pre-trained vector sets
must be `*.vec`.
max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
situations where the entire set doesn't fit in memory, or is not needed for another reason,
passing max_vectors can limit the size of the loaded set (default=None, no limit).
Examples:
>>> fast_text = text.FastText.from_file("/path/to/fast_text/file", max_vectors=None)
"""
max_vectors = max_vectors if max_vectors is not None else 0
return super().from_file(file_path, max_vectors)
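
A slightly fuller eager-mode sketch based on the tests added later in this commit (illustrative, not part of the change set; the path is a placeholder):

import mindspore.dataset.text as text
import mindspore.dataset.text.transforms as T

fast_text = text.FastText.from_file("/path/to/fast_text.vec")
# Tokens missing from the table map to unk_init (which must match the vector
# dimension); with lower_case_backup=True, unmatched tokens are retried in lower case.
to_vectors = T.ToVectors(fast_text, unk_init=[-1, -1, -1, -1, -1, -1], lower_case_backup=True)
vector = to_vectors("This")  # looked up as "this"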

View File

@ -23,11 +23,13 @@
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/vectors.h"
#include "minddata/dataset/text/vocab.h"
using namespace mindspore::dataset;
using mindspore::Status;
using mindspore::dataset::FastText;
using mindspore::dataset::ShuffleMode;
using mindspore::dataset::Tensor;
using mindspore::dataset::Vectors;
@ -3943,3 +3945,357 @@ TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
EXPECT_NE(s, Status::OK());
}
/// Feature: FastText
/// Description: test with default parameter in function BuildFromFile and function Lookup
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextDefaultParam) {
// Test with default parameter.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDefaultParam.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: FastText
/// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextAllBuildfromfileParams) {
// Test with two parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllBuildfromfileParams.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{0, 0, 0, 0, 0, 0},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{0, 0, 0, 0, 0, 0}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: FastText
/// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) {
// Test with two parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextUnknownInit.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: FastText
/// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
/// `unknown_init` and `lower_case_backup` in function Lookup, where some tokens contain uppercase letters
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextAllParams) {
// Test with all parameters.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
EXPECT_NE(lookup, nullptr);
// Create Map operation on ds
ds = ds->Map({lookup}, {"text"});
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_NE(iter, nullptr);
// Iterate the dataset and get each row
std::unordered_map<std::string, mindspore::MSTensor> row;
ASSERT_OK(iter->GetNextRow(&row));
uint64_t i = 0;
std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
{-1, -1, -1, -1, -1, -1},
{0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
{0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
{0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
{0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
{-1, -1, -1, -1, -1, -1}};
while (row.size() != 0) {
auto ind = row["text"];
MS_LOG(INFO) << ind.Shape();
TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
TensorPtr de_expected_item;
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
mindspore::MSTensor ms_expected_item =
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
ASSERT_OK(iter->GetNextRow(&row));
i++;
}
EXPECT_EQ(i, 7);
// Manually terminate the pipeline
iter->Stop();
}
/// Feature: FastText
/// Description: test with a pre-trained vector set whose vectors have different dimensions
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) {
// Not all tokens' vectors have the same dimension.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_dim_different.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
EXPECT_NE(s, Status::OK());
}
/// Feature: FastText
/// Description: test with the parameter max_vectors that is <= 0
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) {
// Test with max_vectors <= 0.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir, -1);
EXPECT_NE(s, Status::OK());
}
/// Feature: FastText
/// Description: test with a pre-trained vector file that is empty
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) {
// Read empty file.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_empty.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
EXPECT_NE(s, Status::OK());
}
/// Feature: FastText
/// Description: test with a pre-trained vector file that does not exist
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) {
// Test with a file that does not exist.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/no_fasttext.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
EXPECT_NE(s, Status::OK());
}
/// Feature: FastText
/// Description: test with a pre-trained vector set whose info header is not on the first line
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) {
// wrong info.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_with_wrong_info.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
EXPECT_NE(s, Status::OK());
}
/// Feature: FastText
/// Description: test with a pre-trained vector file that has a wrong suffix
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) {
// Wrong suffix.
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix.";
// Create a TextFile dataset
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
EXPECT_NE(ds, nullptr);
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.txt";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
EXPECT_NE(s, Status::OK());
}

View File

@ -23,11 +23,13 @@
#include "minddata/dataset/include/dataset/vision.h"
#include "minddata/dataset/include/dataset/audio.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/vectors.h"
#include "utils/log_adapter.h"
using namespace mindspore::dataset;
using mindspore::LogStream;
using mindspore::dataset::FastText;
using mindspore::dataset::Vectors;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::MsLogLevel::INFO;
@ -1665,6 +1667,140 @@ TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParam) {
EXPECT_FALSE(status02.IsOk());
}
/// Feature: FastText
/// Description: test basic usage of FastText and ToVectors with default parameters
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestFastTextParam) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestFastTextParam.";
std::shared_ptr<Tensor> de_tensor;
Tensor::CreateScalar<std::string>("ok", &de_tensor);
auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
mindspore::MSTensor lookup_result;
// Create expected output.
std::shared_ptr<Tensor> de_expected;
std::vector<float> expected = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
dsize_t dim = 6;
ASSERT_OK(Tensor::CreateFromVector(expected, TensorShape({dim}), &de_expected));
auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
// Transform params.
std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
std::shared_ptr<FastText> fast_text01;
Status s01 = FastText::BuildFromFile(&fast_text01, vectors_dir);
EXPECT_EQ(s01, Status::OK());
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text01);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
EXPECT_TRUE(status01.IsOk());
std::shared_ptr<FastText> fast_text02;
Status s02 = FastText::BuildFromFile(&fast_text02, vectors_dir, 100);
EXPECT_EQ(s02, Status::OK());
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text02);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
EXPECT_TRUE(status02.IsOk());
std::shared_ptr<FastText> fast_text03;
Status s03 = FastText::BuildFromFile(&fast_text03, vectors_dir, 3);
EXPECT_EQ(s03, Status::OK());
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(fast_text03);
auto transform03 = Execute({to_vectors03});
Status status03 = transform03(token, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
EXPECT_TRUE(status03.IsOk());
}
/// Feature: ToVectors
/// Description: test basic usage of ToVectors and FastText with default parameters
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestToVectorsParamForFastText) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParamForFastText.";
std::shared_ptr<Tensor> de_tensor01;
Tensor::CreateScalar<std::string>("none", &de_tensor01);
auto token01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor01));
std::shared_ptr<Tensor> de_tensor02;
Tensor::CreateScalar<std::string>("ok", &de_tensor02);
auto token02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor02));
std::shared_ptr<Tensor> de_tensor03;
Tensor::CreateScalar<std::string>("OK", &de_tensor03);
auto token03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor03));
mindspore::MSTensor lookup_result;
// Create expected output.
dsize_t dim = 6;
std::shared_ptr<Tensor> de_expected01;
std::vector<float> expected01 = {0, 0, 0, 0, 0, 0};
ASSERT_OK(Tensor::CreateFromVector(expected01, TensorShape({dim}), &de_expected01));
auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01));
std::shared_ptr<Tensor> de_expected02;
std::vector<float> expected02 = {-1, -1, -1, -1, -1, -1};
ASSERT_OK(Tensor::CreateFromVector(expected02, TensorShape({dim}), &de_expected02));
auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02));
std::shared_ptr<Tensor> de_expected03;
std::vector<float> expected03 = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
ASSERT_OK(Tensor::CreateFromVector(expected03, TensorShape({dim}), &de_expected03));
auto ms_expected03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected03));
// Transform params.
std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
std::shared_ptr<FastText> fast_text;
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token01, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected01);
EXPECT_TRUE(status01.IsOk());
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text, unknown_init);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token01, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected02);
EXPECT_TRUE(status02.IsOk());
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(fast_text, unknown_init);
auto transform03 = Execute({to_vectors03});
Status status03 = transform03(token02, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
EXPECT_TRUE(status03.IsOk());
std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
auto transform04 = Execute({to_vectors04});
Status status04 = transform04(token03, &lookup_result);
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
EXPECT_TRUE(status04.IsOk());
}
/// Feature: ToVectors
/// Description: test invalid parameter of ToVectors for FastText
/// Expectation: throw exception correctly
TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParamForFastText) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParamForFastText.";
std::shared_ptr<Tensor> de_tensor;
Tensor::CreateScalar<std::string>("none", &de_tensor);
auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
mindspore::MSTensor lookup_result;
// Transform params.
std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
std::shared_ptr<FastText> fast_text01;
Status s = FastText::BuildFromFile(&fast_text01, vectors_dir);
EXPECT_EQ(s, Status::OK());
std::vector<float> unknown_init = {-1, -1, -1, -1};
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text01, unknown_init);
auto transform01 = Execute({to_vectors01});
Status status01 = transform01(token, &lookup_result);
EXPECT_FALSE(status01.IsOk());
std::shared_ptr<FastText> fast_text02 = nullptr;
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text02);
auto transform02 = Execute({to_vectors02});
Status status02 = transform02(token, &lookup_result);
EXPECT_FALSE(status02.IsOk());
}
// Feature: DBToAmplitude
// Description: test DBToAmplitude in eager mode
// Expectation: the data is processed successfully

View File

@ -0,0 +1,7 @@
6 6
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
6 6
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
6 6
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
this 0.15164 0.30177 -0.16763 0.17684 0.31719
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
6 6
of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923

View File

@ -0,0 +1,7 @@
ok
.
this
is
my
home
.

View File

@ -0,0 +1,7 @@
ok
!
This
iS
my
HOME
.

View File

@ -0,0 +1,237 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import pytest
from mindspore import log
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.text.transforms as T
DATASET_ROOT_PATH = "../data/dataset/test_fast_text/"
def test_fast_text_all_build_from_file_params():
"""
Feature: FastText
Description: test with all parameters which include `path` and `max_vectors` in from_file
Expectation: output is equal to the expected value
"""
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=100)
to_vectors = text.ToVectors(vectors)
data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
data = data.map(operations=to_vectors, input_columns=["text"])
ind = 0
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0, 0, 0, 0, 0, 0],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
print(data)
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind
ind += 1
def test_fast_text_all_build_from_file_params_eager():
"""
Feature: FastText
Description: test with all parameters which include `path` and `max_vectors` in from_file in eager mode
Expectation: output is equal to the expected value
"""
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=4)
to_vectors = T.ToVectors(vectors)
result1 = to_vectors("ok")
result2 = to_vectors("!")
result3 = to_vectors("this")
result4 = to_vectors("is")
result5 = to_vectors("my")
result6 = to_vectors("home")
result7 = to_vectors("none")
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]]
res_array = np.array(res, dtype=np.float32)
assert np.array_equal(result1, res_array[0])
assert np.array_equal(result2, res_array[1])
assert np.array_equal(result3, res_array[2])
assert np.array_equal(result4, res_array[3])
assert np.array_equal(result5, res_array[4])
assert np.array_equal(result6, res_array[5])
assert np.array_equal(result7, res_array[6])
def test_fast_text_all_to_vectors_params_eager():
"""
Feature: FastText
Description: test with all parameters which include `unk_init` and `lower_case_backup` in function ToVectors
in eager mode
Expectation: output is equal to the expected value
"""
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=4)
my_unk = [-1, -1, -1, -1, -1, -1]
to_vectors = T.ToVectors(vectors, unk_init=my_unk, lower_case_backup=True)
result1 = to_vectors("Ok")
result2 = to_vectors("!")
result3 = to_vectors("This")
result4 = to_vectors("is")
result5 = to_vectors("my")
result6 = to_vectors("home")
result7 = to_vectors("none")
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[-1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1]]
res_array = np.array(res, dtype=np.float32)
assert np.array_equal(result1, res_array[0])
assert np.array_equal(result2, res_array[1])
assert np.array_equal(result3, res_array[2])
assert np.array_equal(result4, res_array[3])
assert np.array_equal(result5, res_array[4])
assert np.array_equal(result6, res_array[5])
assert np.array_equal(result7, res_array[6])
def test_fast_text_build_from_file():
"""
Feature: FastText
Description: test with only default parameter
Expectation: output is equal to the expected value
"""
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec")
to_vectors = text.ToVectors(vectors)
data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
data = data.map(operations=to_vectors, input_columns=["text"])
ind = 0
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0, 0, 0, 0, 0, 0],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
print(data)
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
res_array = np.array(res[ind], dtype=np.float32)
assert np.array_equal(res_array, d["text"]), ind
ind += 1
def test_fast_text_build_from_file_eager():
"""
Feature: FastText
Description: test with only default parameter in eager mode
Expectation: output is equal to the expected value
"""
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec")
to_vectors = T.ToVectors(vectors)
result1 = to_vectors("ok")
result2 = to_vectors("!")
result3 = to_vectors("this")
result4 = to_vectors("is")
result5 = to_vectors("my")
result6 = to_vectors("home")
result7 = to_vectors("none")
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
[0, 0, 0, 0, 0, 0]]
res_array = np.array(res, dtype=np.float32)
assert np.array_equal(result1, res_array[0])
assert np.array_equal(result2, res_array[1])
assert np.array_equal(result3, res_array[2])
assert np.array_equal(result4, res_array[3])
assert np.array_equal(result5, res_array[4])
assert np.array_equal(result6, res_array[5])
assert np.array_equal(result7, res_array[6])
def test_fast_text_invalid_input():
"""
Feature: FastText
Description: test the validate function with invalid parameters
Expectation: output is equal to the expected error
"""
def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None, unk_init=None,
lower_case_backup=False, token="ok"):
log.info("Test FastText with wrong input: {0}".format(test_name))
with pytest.raises(error) as error_info:
vectors = text.FastText.from_file(file_path, max_vectors=max_vectors)
to_vectors = T.ToVectors(vectors, unk_init=unk_init, lower_case_backup=lower_case_backup)
to_vectors(token)
assert error_msg in str(error_info.value)
test_invalid_input("Not all vectors have the same number of dimensions",
DATASET_ROOT_PATH + "fast_text_dim_different.vec", error=RuntimeError,
error_msg="all vectors must have the same number of dimensions, " \
"but got dim 5 while expecting 6")
test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "fast_text_empty.vec",
error=RuntimeError, error_msg="invalid file, file is empty.")
test_invalid_input("the count of `unknown_init`'s element is different with word vector.",
DATASET_ROOT_PATH + "fast_text.vec",
error=RuntimeError,
error_msg="unk_init must be the same length as vectors, but got unk_init",
unk_init=[-1, -1])
test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.vec", RuntimeError,
error_msg="FastText: invalid file")
test_invalid_input("The token is 1-dimensional", DATASET_ROOT_PATH + "fast_text_with_wrong_info.vec",
error=RuntimeError, error_msg="token with 1-dimensional vector.")
test_invalid_input("max_vectors parameter must be greater than 0", DATASET_ROOT_PATH + "fast_text.vec",
error=ValueError, error_msg="Input max_vectors is not within the required interval",
max_vectors=-1)
test_invalid_input("invalid max_vectors parameter type as a float", DATASET_ROOT_PATH + "fast_text.vec",
error=TypeError, error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
" but got <class 'float'>.", max_vectors=1.0)
test_invalid_input("invalid max_vectors parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
error=TypeError, error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
" but got <class 'str'>.", max_vectors="1")
test_invalid_input("invalid token parameter type as a float", DATASET_ROOT_PATH + "fast_text.vec",
error=RuntimeError, error_msg="input tensor type should be string.", token=1.0)
test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
error=TypeError, error_msg="Argument lower_case_backup with value True is " \
"not of type [<class 'bool'>],"
" but got <class 'str'>.", lower_case_backup="True")
test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
error=TypeError, error_msg="Argument lower_case_backup with value True is " \
"not of type [<class 'bool'>],"
" but got <class 'str'>.", lower_case_backup="True")
test_invalid_input("the suffix of pre-training set must be `*.vec`", DATASET_ROOT_PATH + "fast_text.txt",
error=RuntimeError, error_msg="FastText: invalid file, can not find file '*.vec'")
if __name__ == '__main__':
test_fast_text_all_build_from_file_params()
test_fast_text_all_build_from_file_params_eager()
test_fast_text_all_to_vectors_params_eager()
test_fast_text_build_from_file()
test_fast_text_build_from_file_eager()
test_fast_text_invalid_input()