forked from mindspore-Ecosystem/mindspore
[fix] [assistant] [I3ZSQS] add new data operator FastText
This commit is contained in:
parent
baf0f479c1
commit
5ca98a494a
|
@ -19,6 +19,7 @@
|
|||
|
||||
#include "minddata/dataset/api/python/pybind_register.h"
|
||||
#include "minddata/dataset/include/dataset/constants.h"
|
||||
#include "minddata/dataset/text/fast_text.h"
|
||||
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
||||
#include "minddata/dataset/text/vectors.h"
|
||||
#include "minddata/dataset/text/vocab.h"
|
||||
|
@ -88,6 +89,16 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) {
|
|||
.export_values();
|
||||
}));
|
||||
|
||||
// Expose FastText to Python as a subclass of Vectors. `from_file` forwards to
// FastText::BuildFromFile and converts a failing Status into a Python exception.
PYBIND_REGISTER(FastText, 1, ([](const py::module *m) {
                  (void)py::class_<FastText, Vectors, std::shared_ptr<FastText>>(*m, "FastText")
                    .def(py::init<>())
                    .def_static("from_file", [](const std::string &path, int32_t max_vectors) {
                      std::shared_ptr<FastText> fast_text;
                      THROW_IF_ERROR(FastText::BuildFromFile(&fast_text, path, max_vectors));
                      return fast_text;
                    });
                }));
|
||||
|
||||
PYBIND_REGISTER(Vectors, 0, ([](const py::module *m) {
|
||||
(void)py::class_<Vectors, std::shared_ptr<Vectors>>(*m, "Vectors")
|
||||
.def(py::init<>())
|
||||
|
|
|
@ -630,7 +630,7 @@ class MS_API ToNumber final : public TensorTransform {
|
|||
};
|
||||
|
||||
/// \brief Look up a token into an vector according to the input Vectors table.
|
||||
class ToVectors final : public TensorTransform {
|
||||
class MS_API ToVectors final : public TensorTransform {
|
||||
public:
|
||||
/// \brief Constructor.
|
||||
/// \param[in] vectors A Vectors object.
|
||||
|
|
|
@ -4,9 +4,10 @@ add_subdirectory(kernels)
|
|||
file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
|
||||
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
|
||||
add_library(text OBJECT
|
||||
fast_text.cc
|
||||
sentence_piece_vocab.cc
|
||||
vectors.cc
|
||||
vocab.cc
|
||||
sentence_piece_vocab.cc
|
||||
)
|
||||
|
||||
add_dependencies(text text-kernels)
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "minddata/dataset/text/fast_text.h"
|
||||
|
||||
#include "utils/file_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
// Delegate storage of the token -> vector table and the vector dimension to the Vectors base class.
FastText::FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : Vectors(map, dim) {}
|
||||
|
||||
/// \brief Validate that `file_path` names an existing regular file with the
///   FastText pre-trained vector suffix ".vec".
/// \param[in] file_path Path of the candidate pre-trained vector file.
/// \return Error status when the file is missing, is a directory, or lacks the ".vec" suffix.
Status CheckFastText(const std::string &file_path) {
  Path path = Path(file_path);
  if (path.Exists() && !path.IsDirectory()) {
    std::string basename = path.Basename();
    size_t dot = basename.rfind('.');
    // Guard against a basename with no '.' at all: rfind returns npos and
    // substr(npos + 1) == substr(0) would return the whole name, letting a file
    // literally named "vec" (with no extension) slip through the suffix check.
    if (dot == std::string::npos || basename.substr(dot + 1) != "vec") {
      RETURN_STATUS_UNEXPECTED("FastText: invalid file, can not find file '*.vec', but got: " + file_path);
    }
    return Status::OK();
  } else {
    RETURN_STATUS_UNEXPECTED("FastText: invalid file, failed to open FastText file.");
  }
}
|
||||
|
||||
// Read a pre-trained "*.vec" file from disk and wrap the parsed table in a new FastText object.
Status FastText::BuildFromFile(std::shared_ptr<FastText> *fast_text, const std::string &path, int32_t max_vectors) {
  RETURN_UNEXPECTED_IF_NULL(fast_text);
  RETURN_IF_NOT_OK(CheckFastText(path));
  int dim = -1;
  std::unordered_map<std::string, std::vector<float>> token_table;
  RETURN_IF_NOT_OK(Load(path, max_vectors, &token_table, &dim));
  *fast_text = std::make_shared<FastText>(std::move(token_table), dim);
  return Status::OK();
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,55 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/include/dataset/iterator.h"
|
||||
#include "minddata/dataset/text/vectors.h"
|
||||
#include "minddata/dataset/util/path.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
/// \brief Pre-trained word vectors in the FastText "*.vec" text format.
class FastText : public Vectors {
 public:
  /// \brief Default constructor; creates an empty vector table.
  FastText() = default;

  /// \brief Constructor.
  /// \param[in] map A map between string tokens and their pre-trained float vectors.
  /// \param[in] dim Dimension of the vectors.
  FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim);

  /// \brief Destructor.
  ~FastText() = default;

  /// \brief Build a FastText object by reading a pre-trained vector file.
  /// \param[out] fast_text FastText object which contains the pre-trained vectors.
  /// \param[in] path Path to the pre-trained word vector file. The suffix of the file must be `*.vec`.
  /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
  static Status BuildFromFile(std::shared_ptr<FastText> *fast_text, const std::string &path, int32_t max_vectors = 0);
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_
|
|
@ -30,6 +30,8 @@ namespace dataset {
|
|||
class Vectors;
|
||||
class Vocab;
|
||||
class SentencePieceVocab;
|
||||
class Vectors;
|
||||
class Vocab;
|
||||
|
||||
// Transform operations for text
|
||||
namespace text {
|
||||
|
|
|
@ -28,13 +28,13 @@ import platform
|
|||
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
|
||||
TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors
|
||||
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
|
||||
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors
|
||||
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors, FastText
|
||||
|
||||
__all__ = [
|
||||
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
|
||||
"to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
|
||||
"PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType",
|
||||
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors"
|
||||
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors", "FastText"
|
||||
]
|
||||
|
||||
if platform.system().lower() != 'windows':
|
||||
|
|
|
@ -27,7 +27,7 @@ from .validators import check_from_file, check_from_list, check_from_dict, check
|
|||
check_from_file_vectors
|
||||
|
||||
__all__ = [
|
||||
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors"
|
||||
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors", "FastText"
|
||||
]
|
||||
|
||||
|
||||
|
@ -411,3 +411,30 @@ class Vectors(cde.Vectors):
|
|||
|
||||
max_vectors = max_vectors if max_vectors is not None else 0
|
||||
return super().from_file(file_path, max_vectors)
|
||||
|
||||
|
||||
class FastText(cde.FastText):
    """
    FastText object that is used to map tokens into vectors.
    """

    @classmethod
    @check_from_file_vectors
    def from_file(cls, file_path, max_vectors=None):
        """
        Build a FastText vector from a file.

        Args:
            file_path (str): Path of the file that contains the vectors. The suffix of pre-trained vector sets
                must be `*.vec`.
            max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
                Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
                situations where the entire set doesn't fit in memory, or is not needed for another reason,
                passing max_vectors can limit the size of the loaded set (default=None, no limit).

        Examples:
            >>> fast_text = text.FastText.from_file("/path/to/fast_text/file", max_vectors=None)
        """

        # The C++ layer uses 0 as the sentinel for "no limit".
        max_vectors = max_vectors if max_vectors is not None else 0
        return super().from_file(file_path, max_vectors)
|
||||
|
|
|
@ -23,11 +23,13 @@
|
|||
#include "minddata/dataset/include/dataset/datasets.h"
|
||||
#include "minddata/dataset/include/dataset/text.h"
|
||||
#include "minddata/dataset/include/dataset/transforms.h"
|
||||
#include "minddata/dataset/text/fast_text.h"
|
||||
#include "minddata/dataset/text/vectors.h"
|
||||
#include "minddata/dataset/text/vocab.h"
|
||||
|
||||
using namespace mindspore::dataset;
|
||||
using mindspore::Status;
|
||||
using mindspore::dataset::FastText;
|
||||
using mindspore::dataset::ShuffleMode;
|
||||
using mindspore::dataset::Tensor;
|
||||
using mindspore::dataset::Vectors;
|
||||
|
@ -3943,3 +3945,357 @@ TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
|
|||
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
|
||||
EXPECT_NE(s, Status::OK());
|
||||
}
|
||||
|
||||
/// Feature: FastText
/// Description: test with default parameter in function BuildFromFile and function Lookup
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextDefaultParam) {
  // Test with default parameter.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDefaultParam.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // Expected per-token embeddings. The all-zero rows are out-of-vocabulary tokens
  // resolved with ToVectors' default (zero) unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    dsize_t dim = 6;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // One row per token in words.txt.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
/// Feature: FastText
/// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextAllBuildfromfileParams) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllBuildfromfileParams.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  // max_vectors = 100: expected results match the no-limit case, i.e. the limit
  // does not truncate this (small) vector set.
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // All-zero rows are out-of-vocabulary tokens resolved with the default unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    dsize_t dim = 6;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // One row per token in words.txt.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
/// Feature: FastText
/// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextUnknownInit.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  EXPECT_EQ(s, Status::OK());

  // Out-of-vocabulary tokens should be filled with this custom value instead of zeros.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // The -1 rows are OOV tokens resolved with unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    dsize_t dim = 6;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // One row per token in words.txt.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
/// Feature: FastText
/// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
///     `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextAllParams) {
  // Test with all parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams.";
  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  // lower_case_backup = true: upper-case tokens are retried in lower case before
  // falling back to unknown_init.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // The -1 rows are OOV tokens resolved with unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    dsize_t dim = 6;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // One row per token in words.txt.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||
|
||||
/// Feature: FastText
|
||||
/// Description: test with pre-vectors set that have the different dimension
|
||||
/// Expectation: throw correct error and message
|
||||
TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) {
|
||||
// Tokens don't have the same number of vectors.
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension.";
|
||||
|
||||
// Create a TextFile dataset
|
||||
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
|
||||
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_dim_different.vec";
|
||||
std::shared_ptr<FastText> fast_text;
|
||||
Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
|
||||
EXPECT_NE(s, Status::OK());
|
||||
}
|
||||
|
||||
/// Feature: FastText
|
||||
/// Description: test with the parameter max_vectors that is <= 0
|
||||
/// Expectation: throw correct error and message
|
||||
TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) {
|
||||
// Test with max_vectors <= 0.
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero.";
|
||||
|
||||
// Create a TextFile dataset
|
||||
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
|
||||
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
|
||||
std::shared_ptr<FastText> fast_text;
|
||||
Status s = FastText::BuildFromFile(&fast_text, vectors_dir, -1);
|
||||
EXPECT_NE(s, Status::OK());
|
||||
}
|
||||
|
||||
/// Feature: FastText
|
||||
/// Description: test with the pre-vectors file that is empty
|
||||
/// Expectation: throw correct error and message
|
||||
TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) {
|
||||
// Read empty file.
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile.";
|
||||
|
||||
// Create a TextFile dataset
|
||||
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
|
||||
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_empty.vec";
|
||||
std::shared_ptr<FastText> fast_text;
|
||||
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
|
||||
EXPECT_NE(s, Status::OK());
|
||||
}
|
||||
|
||||
/// Feature: FastText
|
||||
/// Description: test with the pre-vectors file that is not exist
|
||||
/// Expectation: throw correct error and message
|
||||
TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) {
|
||||
// Test with not exist file.
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile.";
|
||||
|
||||
// Create a TextFile dataset
|
||||
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
|
||||
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/no_fasttext.vec";
|
||||
std::shared_ptr<FastText> fast_text;
|
||||
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
|
||||
EXPECT_NE(s, Status::OK());
|
||||
}
|
||||
|
||||
/// Feature: FastText
|
||||
/// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
|
||||
/// Expectation: throw correct error and message
|
||||
TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) {
|
||||
// wrong info.
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile.";
|
||||
|
||||
// Create a TextFile dataset
|
||||
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
|
||||
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_with_wrong_info.vec";
|
||||
std::shared_ptr<FastText> fast_text;
|
||||
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
|
||||
EXPECT_NE(s, Status::OK());
|
||||
}
|
||||
|
||||
/// Feature: FastText
|
||||
/// Description: test with the pre-vectors set that has a wrong suffix
|
||||
/// Expectation: throw correct error and message
|
||||
TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) {
|
||||
// wrong info.
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix.";
|
||||
|
||||
// Create a TextFile dataset
|
||||
std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
|
||||
std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.txt";
|
||||
std::shared_ptr<FastText> fast_text;
|
||||
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
|
||||
EXPECT_NE(s, Status::OK());
|
||||
}
|
||||
|
|
|
@ -23,11 +23,13 @@
|
|||
#include "minddata/dataset/include/dataset/vision.h"
|
||||
#include "minddata/dataset/include/dataset/audio.h"
|
||||
#include "minddata/dataset/include/dataset/text.h"
|
||||
#include "minddata/dataset/text/fast_text.h"
|
||||
#include "minddata/dataset/text/vectors.h"
|
||||
#include "utils/log_adapter.h"
|
||||
|
||||
using namespace mindspore::dataset;
|
||||
using mindspore::LogStream;
|
||||
using mindspore::dataset::FastText;
|
||||
using mindspore::dataset::Vectors;
|
||||
using mindspore::ExceptionType::NoExceptionType;
|
||||
using mindspore::MsLogLevel::INFO;
|
||||
|
@ -1665,6 +1667,140 @@ TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParam) {
|
|||
EXPECT_FALSE(status02.IsOk());
|
||||
}
|
||||
|
||||
/// Feature: FastText
/// Description: test basic usage of FastText and the ToVectors with default parameter
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestFastTextParam) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestFastTextParam.";
  std::shared_ptr<Tensor> de_tensor;
  Tensor::CreateScalar<std::string>("ok", &de_tensor);
  auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
  mindspore::MSTensor lookup_result;

  // Create expected output.
  std::shared_ptr<Tensor> de_expected;
  std::vector<float> expected = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
  dsize_t dim = 6;
  ASSERT_OK(Tensor::CreateFromVector(expected, TensorShape({dim}), &de_expected));
  auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  // Transform params.
  std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
  // Case 1: default max_vectors (no limit).
  std::shared_ptr<FastText> fast_text01;
  Status s01 = FastText::BuildFromFile(&fast_text01, vectors_dir);
  EXPECT_EQ(s01, Status::OK());
  std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text01);
  auto transform01 = Execute({to_vectors01});
  Status status01 = transform01(token, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
  EXPECT_TRUE(status01.IsOk());

  // Case 2: explicit max_vectors larger than the set; same result expected.
  std::shared_ptr<FastText> fast_text02;
  Status s02 = FastText::BuildFromFile(&fast_text02, vectors_dir, 100);
  EXPECT_EQ(s02, Status::OK());
  std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text02);
  auto transform02 = Execute({to_vectors02});
  Status status02 = transform02(token, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
  EXPECT_TRUE(status02.IsOk());

  // Case 3: small max_vectors; "ok" must still resolve to the same embedding.
  std::shared_ptr<FastText> fast_text03;
  Status s03 = FastText::BuildFromFile(&fast_text03, vectors_dir, 3);
  EXPECT_EQ(s03, Status::OK());
  std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(fast_text03);
  auto transform03 = Execute({to_vectors03});
  Status status03 = transform03(token, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
  EXPECT_TRUE(status03.IsOk());
}
|
||||
|
||||
/// Feature: ToVectors
/// Description: test basic usage of ToVectors and the FastText with default parameter
/// Expectation: get correct MSTensor
TEST_F(MindDataTestExecute, TestToVectorsParamForFastText) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParamForFastText.";
  // Three probe tokens: an OOV token, an in-vocabulary token, and its upper-case form.
  std::shared_ptr<Tensor> de_tensor01;
  Tensor::CreateScalar<std::string>("none", &de_tensor01);
  auto token01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor01));
  std::shared_ptr<Tensor> de_tensor02;
  Tensor::CreateScalar<std::string>("ok", &de_tensor02);
  auto token02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor02));
  std::shared_ptr<Tensor> de_tensor03;
  Tensor::CreateScalar<std::string>("OK", &de_tensor03);
  auto token03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor03));
  mindspore::MSTensor lookup_result;

  // Create expected output.
  dsize_t dim = 6;
  // expected01: default unknown_init (zeros) for an OOV token.
  std::shared_ptr<Tensor> de_expected01;
  std::vector<float> expected01 = {0, 0, 0, 0, 0, 0};
  ASSERT_OK(Tensor::CreateFromVector(expected01, TensorShape({dim}), &de_expected01));
  auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01));
  // expected02: custom unknown_init (-1s) for an OOV token.
  std::shared_ptr<Tensor> de_expected02;
  std::vector<float> expected02 = {-1, -1, -1, -1, -1, -1};
  ASSERT_OK(Tensor::CreateFromVector(expected02, TensorShape({dim}), &de_expected02));
  auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02));
  // expected03: the real embedding of "ok".
  std::shared_ptr<Tensor> de_expected03;
  std::vector<float> expected03 = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
  ASSERT_OK(Tensor::CreateFromVector(expected03, TensorShape({dim}), &de_expected03));
  auto ms_expected03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected03));

  // Transform params.
  std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  // Default unknown_init: OOV token maps to zeros.
  std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text);
  auto transform01 = Execute({to_vectors01});
  Status status01 = transform01(token01, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected01);
  EXPECT_TRUE(status01.IsOk());
  // Custom unknown_init: OOV token maps to -1s.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text, unknown_init);
  auto transform02 = Execute({to_vectors02});
  Status status02 = transform02(token01, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected02);
  EXPECT_TRUE(status02.IsOk());
  // In-vocabulary token resolves to its real embedding.
  std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(fast_text, unknown_init);
  auto transform03 = Execute({to_vectors03});
  Status status03 = transform03(token02, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
  EXPECT_TRUE(status03.IsOk());
  // lower_case_backup = true: "OK" falls back to "ok" and resolves to the same embedding.
  std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
  auto transform04 = Execute({to_vectors04});
  Status status04 = transform04(token03, &lookup_result);
  EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
  EXPECT_TRUE(status04.IsOk());
}
|
||||
|
||||
/// Feature: ToVectors
|
||||
/// Description: test invalid parameter of ToVectors for FastText
|
||||
/// Expectation: throw exception correctly
|
||||
TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParamForFastText) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParamForFastText.";
  // Scalar string tensor used as the lookup token for both failure cases.
  std::shared_ptr<Tensor> token_tensor;
  Tensor::CreateScalar<std::string>("none", &token_tensor);
  auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(token_tensor));
  mindspore::MSTensor lookup_result;

  // Transform params.
  std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> valid_fast_text;
  Status build_status = FastText::BuildFromFile(&valid_fast_text, vectors_dir);
  EXPECT_EQ(build_status, Status::OK());

  // Case 1: unk_init length (4) differs from the table's vector dimension,
  // so the lookup must fail at execution time.
  std::vector<float> mismatched_unknown_init = {-1, -1, -1, -1};
  std::shared_ptr<TensorTransform> bad_init_op =
    std::make_shared<text::ToVectors>(valid_fast_text, mismatched_unknown_init);
  auto bad_init_exec = Execute({bad_init_op});
  Status bad_init_status = bad_init_exec(token, &lookup_result);
  EXPECT_FALSE(bad_init_status.IsOk());

  // Case 2: a null FastText object must be rejected.
  std::shared_ptr<FastText> null_fast_text = nullptr;
  std::shared_ptr<TensorTransform> null_vectors_op = std::make_shared<text::ToVectors>(null_fast_text);
  auto null_vectors_exec = Execute({null_vectors_op});
  Status null_vectors_status = null_vectors_exec(token, &lookup_result);
  EXPECT_FALSE(null_vectors_status.IsOk());
}
|
||||
|
||||
// Feature: DBToAmplitude
|
||||
// Description: test DBToAmplitude in eager mode
|
||||
// Expectation: the data is processed successfully
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
6 6
|
||||
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
|
||||
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
|
||||
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
|
||||
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
|
||||
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
|
||||
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
|
|
@ -0,0 +1,7 @@
|
|||
6 6
|
||||
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
|
||||
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
|
||||
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
|
||||
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
|
||||
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
|
||||
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
|
|
@ -0,0 +1,7 @@
|
|||
6 6
|
||||
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
|
||||
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
|
||||
this 0.15164 0.30177 -0.16763 0.17684 0.31719
|
||||
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
|
||||
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
|
||||
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
|
|
@ -0,0 +1,7 @@
|
|||
the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
|
||||
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
|
||||
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
|
||||
6 6
|
||||
of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
|
||||
to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
|
||||
and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
|
|
@ -0,0 +1,7 @@
|
|||
ok
|
||||
.
|
||||
this
|
||||
is
|
||||
my
|
||||
home
|
||||
.
|
|
@ -0,0 +1,7 @@
|
|||
ok
|
||||
!
|
||||
This
|
||||
iS
|
||||
my
|
||||
HOME
|
||||
.
|
|
@ -0,0 +1,237 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from mindspore import log
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.text as text
|
||||
import mindspore.dataset.text.transforms as T
|
||||
|
||||
DATASET_ROOT_PATH = "../data/dataset/test_fast_text/"
|
||||
|
||||
|
||||
def test_fast_text_all_build_from_file_params():
    """
    Feature: FastText
    Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=100)
    to_vectors = text.ToVectors(vectors)
    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
    data = data.map(operations=to_vectors, input_columns=["text"])
    # Expected per-row lookups for words.txt; tokens absent from the vector
    # table fall back to the default all-zero initialization.
    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
           [0, 0, 0, 0, 0, 0],
           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
           [0, 0, 0, 0, 0, 0]]
    ind = 0
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res_array = np.array(res[ind], dtype=np.float32)
        assert np.array_equal(res_array, d["text"]), ind
        ind += 1
    # Guard against a vacuous pass: the dataset must yield every expected row.
    assert ind == len(res)
|
||||
|
||||
|
||||
def test_fast_text_all_build_from_file_params_eager():
    """
    Feature: FastText
    Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile in eager mode
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=4)
    to_vectors = T.ToVectors(vectors)
    # With max_vectors=4 only the first four table entries are loaded, so the
    # remaining tokens (and the out-of-vocabulary "none") map to zeros.
    tokens = ["ok", "!", "this", "is", "my", "home", "none"]
    expected = np.array([[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
                         [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
                         [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
                         [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
                         [0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0]], dtype=np.float32)
    for token, expected_row in zip(tokens, expected):
        assert np.array_equal(to_vectors(token), expected_row)
|
||||
|
||||
|
||||
def test_fast_text_all_to_vectors_params_eager():
    """
    Feature: FastText
    Description: test with all parameters which include `unk_init` and `lower_case_backup` in function ToVectors
    in eager mode
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=4)
    my_unk = [-1, -1, -1, -1, -1, -1]
    to_vectors = T.ToVectors(vectors, unk_init=my_unk, lower_case_backup=True)
    # lower_case_backup=True lets mixed-case tokens ("Ok", "This") match their
    # lower-case table entries; tokens outside the first 4 rows fall back to my_unk.
    tokens = ["Ok", "!", "This", "is", "my", "home", "none"]
    expected = np.array([[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
                         [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
                         [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
                         [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
                         [-1, -1, -1, -1, -1, -1],
                         [-1, -1, -1, -1, -1, -1],
                         [-1, -1, -1, -1, -1, -1]], dtype=np.float32)
    for token, expected_row in zip(tokens, expected):
        assert np.array_equal(to_vectors(token), expected_row)
|
||||
|
||||
|
||||
def test_fast_text_build_from_file():
    """
    Feature: FastText
    Description: test with only default parameter
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec")
    to_vectors = text.ToVectors(vectors)
    data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
    data = data.map(operations=to_vectors, input_columns=["text"])
    # Expected per-row lookups for words.txt; tokens absent from the vector
    # table fall back to the default all-zero initialization.
    res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
           [0, 0, 0, 0, 0, 0],
           [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
           [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
           [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
           [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
           [0, 0, 0, 0, 0, 0]]
    ind = 0
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        res_array = np.array(res[ind], dtype=np.float32)
        assert np.array_equal(res_array, d["text"]), ind
        ind += 1
    # Guard against a vacuous pass: the dataset must yield every expected row.
    assert ind == len(res)
|
||||
|
||||
|
||||
def test_fast_text_build_from_file_eager():
    """
    Feature: FastText
    Description: test with only default parameter in eager mode
    Expectation: output is equal to the expected value
    """
    vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec")
    to_vectors = T.ToVectors(vectors)
    # All six table entries are loaded; only the out-of-vocabulary token
    # "none" falls back to the default all-zero vector.
    tokens = ["ok", "!", "this", "is", "my", "home", "none"]
    expected = np.array([[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
                         [0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
                         [0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
                         [0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
                         [0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
                         [0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
                         [0, 0, 0, 0, 0, 0]], dtype=np.float32)
    for token, expected_row in zip(tokens, expected):
        assert np.array_equal(to_vectors(token), expected_row)
|
||||
|
||||
|
||||
def test_fast_text_invalid_input():
    """
    Feature: FastText
    Description: test the validate function with invalid parameters
    Expectation: output is equal to the expected error
    """
    def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None, unk_init=None,
                           lower_case_backup=False, token="ok"):
        # Build FastText + ToVectors with the given (invalid) arguments and
        # verify both the exception type and a fragment of its message.
        log.info("Test Vectors with wrong input: {0}".format(test_name))
        with pytest.raises(error) as error_info:
            vectors = text.FastText.from_file(file_path, max_vectors=max_vectors)
            to_vectors = T.ToVectors(vectors, unk_init=unk_init, lower_case_backup=lower_case_backup)
            to_vectors(token)
        assert error_msg in str(error_info.value)

    test_invalid_input("Not all vectors have the same number of dimensions",
                       DATASET_ROOT_PATH + "fast_text_dim_different.vec", error=RuntimeError,
                       error_msg="all vectors must have the same number of dimensions, " \
                                 "but got dim 5 while expecting 6")
    test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "fast_text_empty.vec",
                       error=RuntimeError, error_msg="invalid file, file is empty.")
    test_invalid_input("the count of `unknown_init`'s element is different with word vector.",
                       DATASET_ROOT_PATH + "fast_text.vec",
                       error=RuntimeError,
                       error_msg="unk_init must be the same length as vectors, but got unk_init",
                       unk_init=[-1, -1])
    test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.vec", RuntimeError,
                       error_msg="FastText: invalid file")
    test_invalid_input("The token is 1-dimensional", DATASET_ROOT_PATH + "fast_text_with_wrong_info.vec",
                       error=RuntimeError, error_msg="token with 1-dimensional vector.")
    test_invalid_input("max_vectors parameter must be greater than 0", DATASET_ROOT_PATH + "fast_text.vec",
                       error=ValueError, error_msg="Input max_vectors is not within the required interval",
                       max_vectors=-1)
    test_invalid_input("invalid max_vectors parameter type as a float", DATASET_ROOT_PATH + "fast_text.vec",
                       error=TypeError, error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
                                                  " but got <class 'float'>.", max_vectors=1.0)
    test_invalid_input("invalid max_vectors parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
                       error=TypeError, error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
                                                  " but got <class 'str'>.", max_vectors="1")
    test_invalid_input("invalid token parameter type as a float", DATASET_ROOT_PATH + "fast_text.vec",
                       error=RuntimeError, error_msg="input tensor type should be string.", token=1.0)
    # NOTE: the original file invoked this exact case twice back to back; the
    # duplicate invocation was removed as it added no coverage.
    test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
                       error=TypeError, error_msg="Argument lower_case_backup with value True is " \
                                                  "not of type [<class 'bool'>],"
                                                  " but got <class 'str'>.", lower_case_backup="True")
    test_invalid_input("the suffix of pre-training set must be `*.vec`", DATASET_ROOT_PATH + "fast_text.txt",
                       error=RuntimeError, error_msg="FastText: invalid file, can not find file '*.vec'")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the full FastText test suite directly (outside of pytest).
    test_fast_text_all_build_from_file_params()
    test_fast_text_all_build_from_file_params_eager()
    test_fast_text_all_to_vectors_params_eager()
    test_fast_text_build_from_file()
    test_fast_text_build_from_file_eager()
    test_fast_text_invalid_input()
|
Loading…
Reference in New Issue