forked from mindspore-Ecosystem/mindspore
!22712 [assistant][ops] Add FastText
Merge pull request !22712 from 无言/FastText
This commit is contained in:
commit
0c6505db26
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
#include "minddata/dataset/api/python/pybind_register.h"
|
#include "minddata/dataset/api/python/pybind_register.h"
|
||||||
#include "minddata/dataset/include/dataset/constants.h"
|
#include "minddata/dataset/include/dataset/constants.h"
|
||||||
|
#include "minddata/dataset/text/fast_text.h"
|
||||||
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
#include "minddata/dataset/text/sentence_piece_vocab.h"
|
||||||
#include "minddata/dataset/text/vectors.h"
|
#include "minddata/dataset/text/vectors.h"
|
||||||
#include "minddata/dataset/text/vocab.h"
|
#include "minddata/dataset/text/vocab.h"
|
||||||
|
@ -88,6 +89,16 @@ PYBIND_REGISTER(SentencePieceModel, 0, ([](const py::module *m) {
|
||||||
.export_values();
|
.export_values();
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
// Expose FastText to Python as a subclass of Vectors. Instances are created via the
// static from_file factory rather than a Python-side constructor with arguments.
PYBIND_REGISTER(FastText, 1, ([](const py::module *m) {
                  (void)py::class_<FastText, Vectors, std::shared_ptr<FastText>>(*m, "FastText")
                    .def(py::init<>())
                    .def_static("from_file", [](const std::string &path, int32_t max_vectors) {
                      std::shared_ptr<FastText> fast_text;
                      // THROW_IF_ERROR converts a non-OK Status from BuildFromFile into a Python exception.
                      THROW_IF_ERROR(FastText::BuildFromFile(&fast_text, path, max_vectors));
                      return fast_text;
                    });
                }));
|
||||||
|
|
||||||
PYBIND_REGISTER(Vectors, 0, ([](const py::module *m) {
|
PYBIND_REGISTER(Vectors, 0, ([](const py::module *m) {
|
||||||
(void)py::class_<Vectors, std::shared_ptr<Vectors>>(*m, "Vectors")
|
(void)py::class_<Vectors, std::shared_ptr<Vectors>>(*m, "Vectors")
|
||||||
.def(py::init<>())
|
.def(py::init<>())
|
||||||
|
|
|
@ -630,7 +630,7 @@ class MS_API ToNumber final : public TensorTransform {
|
||||||
};
|
};
|
||||||
|
|
||||||
/// \brief Look up a token into an vector according to the input Vectors table.
|
/// \brief Look up a token into an vector according to the input Vectors table.
|
||||||
class ToVectors final : public TensorTransform {
|
class MS_API ToVectors final : public TensorTransform {
|
||||||
public:
|
public:
|
||||||
/// \brief Constructor.
|
/// \brief Constructor.
|
||||||
/// \param[in] vectors A Vectors object.
|
/// \param[in] vectors A Vectors object.
|
||||||
|
|
|
@ -4,9 +4,10 @@ add_subdirectory(kernels)
|
||||||
file(GLOB _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
# Sources kept in alphabetical order; sentence_piece_vocab.cc was listed twice in the
# merged result — a duplicate source entry — and is listed exactly once here.
add_library(text OBJECT
        fast_text.cc
        sentence_piece_vocab.cc
        vectors.cc
        vocab.cc
        )

add_dependencies(text text-kernels)
|
|
@ -0,0 +1,50 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "minddata/dataset/text/fast_text.h"
|
||||||
|
|
||||||
|
#include "utils/file_utils.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
// Construct from a prebuilt token -> embedding map; storage and lookup are delegated to Vectors.
FastText::FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim) : Vectors(map, dim) {}
|
||||||
|
|
||||||
|
// Validate that `file_path` names an existing regular file with a ".vec" suffix.
// Returns OK on success; otherwise an error Status describing the failure.
Status CheckFastText(const std::string &file_path) {
  Path path = Path(file_path);
  if (!path.Exists() || path.IsDirectory()) {
    RETURN_STATUS_UNEXPECTED("FastText: invalid file, failed to open FastText file.");
  }
  std::string basename = path.Basename();
  size_t dot = basename.rfind('.');
  // rfind returns npos when there is no '.'; the original substr(npos + 1) == substr(0)
  // would wrongly accept a file literally named "vec". Reject extension-less names explicitly.
  if (dot == std::string::npos || basename.substr(dot + 1) != "vec") {
    RETURN_STATUS_UNEXPECTED("FastText: invalid file, can not find file '*.vec', but got: " + file_path);
  }
  return Status::OK();
}
|
||||||
|
|
||||||
|
// Build a FastText table by reading a pre-trained "*.vec" file.
// `fast_text` receives the constructed object; `max_vectors` bounds how many rows are loaded.
Status FastText::BuildFromFile(std::shared_ptr<FastText> *fast_text, const std::string &path, int32_t max_vectors) {
  // Output pointer must be valid before any work is done.
  RETURN_UNEXPECTED_IF_NULL(fast_text);
  // Reject paths that do not exist, are directories, or lack the ".vec" suffix.
  RETURN_IF_NOT_OK(CheckFastText(path));
  std::unordered_map<std::string, std::vector<float>> map;
  int vector_dim = -1;  // Filled in by Load from the file contents.
  RETURN_IF_NOT_OK(Load(path, max_vectors, &map, &vector_dim));
  *fast_text = std::make_shared<FastText>(std::move(map), vector_dim);
  return Status::OK();
}
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
|
@ -0,0 +1,55 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_
|
||||||
|
#define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "minddata/dataset/core/tensor.h"
|
||||||
|
#include "minddata/dataset/include/dataset/iterator.h"
|
||||||
|
#include "minddata/dataset/text/vectors.h"
|
||||||
|
#include "minddata/dataset/util/path.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace dataset {
|
||||||
|
/// \brief Pre-trained word vectors in FastText ".vec" format.
class FastText : public Vectors {
 public:
  /// \brief Default constructor; creates an empty vector table.
  FastText() = default;

  /// \brief Constructor.
  /// \param[in] map A map between string tokens and their embedding vectors.
  /// \param[in] dim Dimension of the vectors.
  FastText(const std::unordered_map<std::string, std::vector<float>> &map, int dim);

  /// \brief Destructor.
  ~FastText() = default;

  /// \brief Build a FastText object by reading a pre-trained vector file.
  /// \param[out] fast_text FastText object which contains the pre-trained vectors.
  /// \param[in] path Path to the pre-trained word vector file. The suffix of the file must be `*.vec`.
  /// \param[in] max_vectors This can be used to limit the number of pre-trained vectors loaded (default=0, no limit).
  /// \return Status error code, returns OK if no error is encountered.
  static Status BuildFromFile(std::shared_ptr<FastText> *fast_text, const std::string &path, int32_t max_vectors = 0);
};
|
||||||
|
} // namespace dataset
|
||||||
|
} // namespace mindspore
|
||||||
|
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_FAST_TEXT_H_
|
|
@ -30,6 +30,8 @@ namespace dataset {
|
||||||
// Forward declarations for types referenced by the transform declarations below.
// The merged result declared Vectors and Vocab twice; each is declared exactly once here
// (redundant forward declarations are legal C++ but add noise).
class Vectors;
class Vocab;
class SentencePieceVocab;
|
||||||
|
|
||||||
// Transform operations for text
|
// Transform operations for text
|
||||||
namespace text {
|
namespace text {
|
||||||
|
|
|
@ -28,13 +28,13 @@ import platform
|
||||||
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
|
from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, \
|
||||||
TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors
|
TruncateSequencePair, ToNumber, SlidingWindow, SentencePieceTokenizer, PythonTokenizer, ToVectors
|
||||||
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
|
from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm, SentencePieceVocab, SentencePieceModel, \
|
||||||
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors
|
SPieceTokenizerOutType, SPieceTokenizerLoadType, Vectors, FastText
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
|
"Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
|
||||||
"to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
|
"to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
|
||||||
"PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType",
|
"PythonTokenizer", "SlidingWindow", "SentencePieceVocab", "SentencePieceTokenizer", "SPieceTokenizerOutType",
|
||||||
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors"
|
"SentencePieceModel", "SPieceTokenizerLoadType", "JiebaMode", "NormalizeForm", "Vectors", "ToVectors", "FastText"
|
||||||
]
|
]
|
||||||
|
|
||||||
if platform.system().lower() != 'windows':
|
if platform.system().lower() != 'windows':
|
||||||
|
|
|
@ -27,7 +27,7 @@ from .validators import check_from_file, check_from_list, check_from_dict, check
|
||||||
check_from_file_vectors
|
check_from_file_vectors
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors"
|
"Vocab", "SentencePieceVocab", "to_str", "to_bytes", "Vectors", "FastText"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -411,3 +411,30 @@ class Vectors(cde.Vectors):
|
||||||
|
|
||||||
max_vectors = max_vectors if max_vectors is not None else 0
|
max_vectors = max_vectors if max_vectors is not None else 0
|
||||||
return super().from_file(file_path, max_vectors)
|
return super().from_file(file_path, max_vectors)
|
||||||
|
|
||||||
|
|
||||||
|
class FastText(cde.FastText):
    """
    FastText object that is used to map tokens into vectors.
    """

    @classmethod
    @check_from_file_vectors
    def from_file(cls, file_path, max_vectors=None):
        """
        Build a FastText vector from a file.

        Args:
            file_path (str): Path of the file that contains the vectors. The suffix of the pre-trained
                vector sets must be `*.vec`.
            max_vectors (int, optional): This can be used to limit the number of pre-trained vectors loaded.
                Most pre-trained vector sets are sorted in the descending order of word frequency. Thus, in
                situations where the entire set doesn't fit in memory, or is not needed for another reason,
                passing max_vectors can limit the size of the loaded set (default=None, no limit).

        Examples:
            >>> fast_text = text.FastText.from_file("/path/to/fast_text/file", max_vectors=None)
        """
        # The C++ side uses 0 to mean "no limit"; map Python's None to that sentinel.
        max_vectors = max_vectors if max_vectors is not None else 0
        return super().from_file(file_path, max_vectors)
|
||||||
|
|
|
@ -23,11 +23,13 @@
|
||||||
#include "minddata/dataset/include/dataset/datasets.h"
|
#include "minddata/dataset/include/dataset/datasets.h"
|
||||||
#include "minddata/dataset/include/dataset/text.h"
|
#include "minddata/dataset/include/dataset/text.h"
|
||||||
#include "minddata/dataset/include/dataset/transforms.h"
|
#include "minddata/dataset/include/dataset/transforms.h"
|
||||||
|
#include "minddata/dataset/text/fast_text.h"
|
||||||
#include "minddata/dataset/text/vectors.h"
|
#include "minddata/dataset/text/vectors.h"
|
||||||
#include "minddata/dataset/text/vocab.h"
|
#include "minddata/dataset/text/vocab.h"
|
||||||
|
|
||||||
using namespace mindspore::dataset;
|
using namespace mindspore::dataset;
|
||||||
using mindspore::Status;
|
using mindspore::Status;
|
||||||
|
using mindspore::dataset::FastText;
|
||||||
using mindspore::dataset::ShuffleMode;
|
using mindspore::dataset::ShuffleMode;
|
||||||
using mindspore::dataset::Tensor;
|
using mindspore::dataset::Tensor;
|
||||||
using mindspore::dataset::Vectors;
|
using mindspore::dataset::Vectors;
|
||||||
|
@ -3943,3 +3945,357 @@ TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
|
||||||
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
|
Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
|
||||||
EXPECT_NE(s, Status::OK());
|
EXPECT_NE(s, Status::OK());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with default parameter in function BuildFromFile and function Lookup
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextDefaultParam) {
  // Test with default parameter.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDefaultParam.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // Expected embedding per input token; the all-zero rows are out-of-vocabulary tokens
  // mapped through ToVectors' default unknown handling.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    dsize_t dim = 6;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 7 rows of words.txt should have been consumed.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextAllBuildfromfileParams) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllBuildfromfileParams.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  // max_vectors=100 is larger than the fixture, so the full set is loaded.
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // Same expected values as the default-parameter test; all-zero rows are OOV tokens.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    dsize_t dim = 6;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 7 rows should have been consumed.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextUnknownInit.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  EXPECT_EQ(s, Status::OK());

  // OOV tokens should be mapped to this vector instead of zeros.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // Rows of all -1 correspond to OOV tokens resolved through unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    dsize_t dim = 6;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 7 rows should have been consumed.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
/// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
/// Expectation: return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextAllParams) {
  // Test with all parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams.";
  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  // lower_case_backup=true: uppercase tokens fall back to their lowercase form before lookup.
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // Rows of all -1 correspond to OOV tokens resolved through unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  while (row.size() != 0) {
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    dsize_t dim = 6;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 7 rows should have been consumed.
  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with pre-vectors set that have the different dimension
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) {
  // Tokens don't have the same number of vectors.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension.";

  // Create a TextFile dataset
  // NOTE(review): `ds` is never consumed by this negative test; kept for parity with the positive tests.
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Fixture whose rows do not all share the same vector dimension; loading must fail.
  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_dim_different.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  EXPECT_NE(s, Status::OK());
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with the parameter max_vectors that is <= 0
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) {
  // Test with max_vectors <= 0.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero.";

  // Create a TextFile dataset
  // NOTE(review): `ds` is never consumed by this negative test; kept for parity with the positive tests.
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // A negative max_vectors is invalid and must be rejected.
  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir, -1);
  EXPECT_NE(s, Status::OK());
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with the pre-vectors file that is empty
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) {
  // Read empty file.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile.";

  // Create a TextFile dataset
  // NOTE(review): `ds` is never consumed by this negative test; kept for parity with the positive tests.
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // An empty vector file must be rejected.
  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_empty.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_NE(s, Status::OK());
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with the pre-vectors file that is not exist
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) {
  // Test with not exist file.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile.";

  // Create a TextFile dataset
  // NOTE(review): `ds` is never consumed by this negative test; kept for parity with the positive tests.
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // A path that does not exist must be rejected.
  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/no_fasttext.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_NE(s, Status::OK());
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) {
  // wrong info.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile.";

  // Create a TextFile dataset
  // NOTE(review): `ds` is never consumed by this negative test; kept for parity with the positive tests.
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Fixture whose info header line is misplaced; loading must fail.
  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_with_wrong_info.vec";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_NE(s, Status::OK());
}
|
||||||
|
|
||||||
|
/// Feature: FastText
/// Description: test with the pre-vectors set that has a wrong suffix
/// Expectation: throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) {
  // wrong info.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix.";

  // Create a TextFile dataset
  // NOTE(review): `ds` is never consumed by this negative test; kept for parity with the positive tests.
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // A ".txt" suffix (not ".vec") must be rejected by the suffix check.
  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.txt";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_NE(s, Status::OK());
}
|
||||||
|
|
|
@ -23,11 +23,13 @@
|
||||||
#include "minddata/dataset/include/dataset/vision.h"
|
#include "minddata/dataset/include/dataset/vision.h"
|
||||||
#include "minddata/dataset/include/dataset/audio.h"
|
#include "minddata/dataset/include/dataset/audio.h"
|
||||||
#include "minddata/dataset/include/dataset/text.h"
|
#include "minddata/dataset/include/dataset/text.h"
|
||||||
|
#include "minddata/dataset/text/fast_text.h"
|
||||||
#include "minddata/dataset/text/vectors.h"
|
#include "minddata/dataset/text/vectors.h"
|
||||||
#include "utils/log_adapter.h"
|
#include "utils/log_adapter.h"
|
||||||
|
|
||||||
using namespace mindspore::dataset;
|
using namespace mindspore::dataset;
|
||||||
using mindspore::LogStream;
|
using mindspore::LogStream;
|
||||||
|
using mindspore::dataset::FastText;
|
||||||
using mindspore::dataset::Vectors;
|
using mindspore::dataset::Vectors;
|
||||||
using mindspore::ExceptionType::NoExceptionType;
|
using mindspore::ExceptionType::NoExceptionType;
|
||||||
using mindspore::MsLogLevel::INFO;
|
using mindspore::MsLogLevel::INFO;
|
||||||
|
@ -1665,6 +1667,140 @@ TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParam) {
|
||||||
EXPECT_FALSE(status02.IsOk());
|
EXPECT_FALSE(status02.IsOk());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Feature: FastText
|
||||||
|
/// Description: test basic usage of FastText and the ToVectors with default parameter
|
||||||
|
/// Expectation: get correct MSTensor
|
||||||
|
TEST_F(MindDataTestExecute, TestFastTextParam) {
|
||||||
|
MS_LOG(INFO) << "Doing MindDataTestExecute-TestFastTextParam.";
|
||||||
|
std::shared_ptr<Tensor> de_tensor;
|
||||||
|
Tensor::CreateScalar<std::string>("ok", &de_tensor);
|
||||||
|
auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
|
||||||
|
mindspore::MSTensor lookup_result;
|
||||||
|
|
||||||
|
// Create expected output.
|
||||||
|
std::shared_ptr<Tensor> de_expected;
|
||||||
|
std::vector<float> expected = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
|
||||||
|
dsize_t dim = 6;
|
||||||
|
ASSERT_OK(Tensor::CreateFromVector(expected, TensorShape({dim}), &de_expected));
|
||||||
|
auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
|
||||||
|
|
||||||
|
// Transform params.
|
||||||
|
std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
|
||||||
|
std::shared_ptr<FastText> fast_text01;
|
||||||
|
Status s01 = FastText::BuildFromFile(&fast_text01, vectors_dir);
|
||||||
|
EXPECT_EQ(s01, Status::OK());
|
||||||
|
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text01);
|
||||||
|
auto transform01 = Execute({to_vectors01});
|
||||||
|
Status status01 = transform01(token, &lookup_result);
|
||||||
|
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
|
||||||
|
EXPECT_TRUE(status01.IsOk());
|
||||||
|
|
||||||
|
std::shared_ptr<FastText> fast_text02;
|
||||||
|
Status s02 = FastText::BuildFromFile(&fast_text02, vectors_dir, 100);
|
||||||
|
EXPECT_EQ(s02, Status::OK());
|
||||||
|
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text02);
|
||||||
|
auto transform02 = Execute({to_vectors02});
|
||||||
|
Status status02 = transform02(token, &lookup_result);
|
||||||
|
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
|
||||||
|
EXPECT_TRUE(status02.IsOk());
|
||||||
|
|
||||||
|
std::shared_ptr<FastText> fast_text03;
|
||||||
|
Status s03 = FastText::BuildFromFile(&fast_text03, vectors_dir, 3);
|
||||||
|
EXPECT_EQ(s03, Status::OK());
|
||||||
|
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(fast_text03);
|
||||||
|
auto transform03 = Execute({to_vectors03});
|
||||||
|
Status status03 = transform03(token, &lookup_result);
|
||||||
|
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected);
|
||||||
|
EXPECT_TRUE(status03.IsOk());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Feature: ToVectors
|
||||||
|
/// Description: test basic usage of ToVectors and the FastText with default parameter
|
||||||
|
/// Expectation: get correct MSTensor
|
||||||
|
TEST_F(MindDataTestExecute, TestToVectorsParamForFastText) {
|
||||||
|
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsParamForFastText.";
|
||||||
|
std::shared_ptr<Tensor> de_tensor01;
|
||||||
|
Tensor::CreateScalar<std::string>("none", &de_tensor01);
|
||||||
|
auto token01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor01));
|
||||||
|
std::shared_ptr<Tensor> de_tensor02;
|
||||||
|
Tensor::CreateScalar<std::string>("ok", &de_tensor02);
|
||||||
|
auto token02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor02));
|
||||||
|
std::shared_ptr<Tensor> de_tensor03;
|
||||||
|
Tensor::CreateScalar<std::string>("OK", &de_tensor03);
|
||||||
|
auto token03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor03));
|
||||||
|
mindspore::MSTensor lookup_result;
|
||||||
|
|
||||||
|
// Create expected output.
|
||||||
|
dsize_t dim = 6;
|
||||||
|
std::shared_ptr<Tensor> de_expected01;
|
||||||
|
std::vector<float> expected01 = {0, 0, 0, 0, 0, 0};
|
||||||
|
ASSERT_OK(Tensor::CreateFromVector(expected01, TensorShape({dim}), &de_expected01));
|
||||||
|
auto ms_expected01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected01));
|
||||||
|
std::shared_ptr<Tensor> de_expected02;
|
||||||
|
std::vector<float> expected02 = {-1, -1, -1, -1, -1, -1};
|
||||||
|
ASSERT_OK(Tensor::CreateFromVector(expected02, TensorShape({dim}), &de_expected02));
|
||||||
|
auto ms_expected02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected02));
|
||||||
|
std::shared_ptr<Tensor> de_expected03;
|
||||||
|
std::vector<float> expected03 = {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411};
|
||||||
|
ASSERT_OK(Tensor::CreateFromVector(expected03, TensorShape({dim}), &de_expected03));
|
||||||
|
auto ms_expected03 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected03));
|
||||||
|
|
||||||
|
// Transform params.
|
||||||
|
std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
|
||||||
|
std::shared_ptr<FastText> fast_text;
|
||||||
|
Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
|
||||||
|
EXPECT_EQ(s, Status::OK());
|
||||||
|
|
||||||
|
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text);
|
||||||
|
auto transform01 = Execute({to_vectors01});
|
||||||
|
Status status01 = transform01(token01, &lookup_result);
|
||||||
|
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected01);
|
||||||
|
EXPECT_TRUE(status01.IsOk());
|
||||||
|
std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
|
||||||
|
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text, unknown_init);
|
||||||
|
auto transform02 = Execute({to_vectors02});
|
||||||
|
Status status02 = transform02(token01, &lookup_result);
|
||||||
|
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected02);
|
||||||
|
EXPECT_TRUE(status02.IsOk());
|
||||||
|
std::shared_ptr<TensorTransform> to_vectors03 = std::make_shared<text::ToVectors>(fast_text, unknown_init);
|
||||||
|
auto transform03 = Execute({to_vectors03});
|
||||||
|
Status status03 = transform03(token02, &lookup_result);
|
||||||
|
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
|
||||||
|
EXPECT_TRUE(status03.IsOk());
|
||||||
|
std::shared_ptr<TensorTransform> to_vectors04 = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
|
||||||
|
auto transform04 = Execute({to_vectors04});
|
||||||
|
Status status04 = transform04(token03, &lookup_result);
|
||||||
|
EXPECT_MSTENSOR_EQ(lookup_result, ms_expected03);
|
||||||
|
EXPECT_TRUE(status04.IsOk());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Feature: ToVectors
|
||||||
|
/// Description: test invalid parameter of ToVectors for FastText
|
||||||
|
/// Expectation: throw exception correctly
|
||||||
|
TEST_F(MindDataTestExecute, TestToVectorsWithInvalidParamForFastText) {
|
||||||
|
MS_LOG(INFO) << "Doing MindDataTestExecute-TestToVectorsWithInvalidParamForFastText.";
|
||||||
|
std::shared_ptr<Tensor> de_tensor;
|
||||||
|
Tensor::CreateScalar<std::string>("none", &de_tensor);
|
||||||
|
auto token = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
|
||||||
|
mindspore::MSTensor lookup_result;
|
||||||
|
|
||||||
|
// Transform params.
|
||||||
|
std::string vectors_dir = "data/dataset/test_fast_text/fast_text.vec";
|
||||||
|
std::shared_ptr<FastText> fast_text01;
|
||||||
|
Status s = FastText::BuildFromFile(&fast_text01, vectors_dir);
|
||||||
|
EXPECT_EQ(s, Status::OK());
|
||||||
|
std::vector<float> unknown_init = {-1, -1, -1, -1};
|
||||||
|
std::shared_ptr<TensorTransform> to_vectors01 = std::make_shared<text::ToVectors>(fast_text01, unknown_init);
|
||||||
|
auto transform01 = Execute({to_vectors01});
|
||||||
|
Status status01 = transform01(token, &lookup_result);
|
||||||
|
EXPECT_FALSE(status01.IsOk());
|
||||||
|
std::shared_ptr<FastText> fast_text02 = nullptr;
|
||||||
|
std::shared_ptr<TensorTransform> to_vectors02 = std::make_shared<text::ToVectors>(fast_text02);
|
||||||
|
auto transform02 = Execute({to_vectors02});
|
||||||
|
Status status02 = transform02(token, &lookup_result);
|
||||||
|
EXPECT_FALSE(status02.IsOk());
|
||||||
|
}
|
||||||
|
|
||||||
// Feature: DBToAmplitude
|
// Feature: DBToAmplitude
|
||||||
// Description: test DBToAmplitude in eager mode
|
// Description: test DBToAmplitude in eager mode
|
||||||
// Expectation: the data is processed successfully
|
// Expectation: the data is processed successfully
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
6 6
|
||||||
|
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
|
||||||
|
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
|
||||||
|
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
|
||||||
|
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
|
||||||
|
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
|
||||||
|
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
|
|
@ -0,0 +1,7 @@
|
||||||
|
6 6
|
||||||
|
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
|
||||||
|
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
|
||||||
|
this 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
|
||||||
|
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
|
||||||
|
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
|
||||||
|
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
|
|
@ -0,0 +1,7 @@
|
||||||
|
6 6
|
||||||
|
ok 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
|
||||||
|
! 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
|
||||||
|
this 0.15164 0.30177 -0.16763 0.17684 0.31719
|
||||||
|
is 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
|
||||||
|
my 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
|
||||||
|
home 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
|
|
@ -0,0 +1,7 @@
|
||||||
|
the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445718411
|
||||||
|
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709
|
||||||
|
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973
|
||||||
|
6 6
|
||||||
|
of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603
|
||||||
|
to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246
|
||||||
|
and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923
|
|
@ -0,0 +1,7 @@
|
||||||
|
ok
|
||||||
|
.
|
||||||
|
this
|
||||||
|
is
|
||||||
|
my
|
||||||
|
home
|
||||||
|
.
|
|
@ -0,0 +1,7 @@
|
||||||
|
ok
|
||||||
|
!
|
||||||
|
This
|
||||||
|
iS
|
||||||
|
my
|
||||||
|
HOME
|
||||||
|
.
|
|
@ -0,0 +1,237 @@
|
||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from mindspore import log
|
||||||
|
import mindspore.dataset as ds
|
||||||
|
import mindspore.dataset.text as text
|
||||||
|
import mindspore.dataset.text.transforms as T
|
||||||
|
|
||||||
|
DATASET_ROOT_PATH = "../data/dataset/test_fast_text/"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fast_text_all_build_from_file_params():
|
||||||
|
"""
|
||||||
|
Feature: FastText
|
||||||
|
Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=100)
|
||||||
|
to_vectors = text.ToVectors(vectors)
|
||||||
|
data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
|
||||||
|
data = data.map(operations=to_vectors, input_columns=["text"])
|
||||||
|
ind = 0
|
||||||
|
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
|
||||||
|
[0, 0, 0, 0, 0, 0],
|
||||||
|
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
|
||||||
|
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
|
||||||
|
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
|
||||||
|
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
|
||||||
|
[0, 0, 0, 0, 0, 0]]
|
||||||
|
print(data)
|
||||||
|
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
|
res_array = np.array(res[ind], dtype=np.float32)
|
||||||
|
assert np.array_equal(res_array, d["text"]), ind
|
||||||
|
ind += 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_fast_text_all_build_from_file_params_eager():
|
||||||
|
"""
|
||||||
|
Feature: FastText
|
||||||
|
Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile in eager mode
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=4)
|
||||||
|
to_vectors = T.ToVectors(vectors)
|
||||||
|
result1 = to_vectors("ok")
|
||||||
|
result2 = to_vectors("!")
|
||||||
|
result3 = to_vectors("this")
|
||||||
|
result4 = to_vectors("is")
|
||||||
|
result5 = to_vectors("my")
|
||||||
|
result6 = to_vectors("home")
|
||||||
|
result7 = to_vectors("none")
|
||||||
|
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
|
||||||
|
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
|
||||||
|
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
|
||||||
|
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
|
||||||
|
[0, 0, 0, 0, 0, 0],
|
||||||
|
[0, 0, 0, 0, 0, 0],
|
||||||
|
[0, 0, 0, 0, 0, 0]]
|
||||||
|
res_array = np.array(res, dtype=np.float32)
|
||||||
|
|
||||||
|
assert np.array_equal(result1, res_array[0])
|
||||||
|
assert np.array_equal(result2, res_array[1])
|
||||||
|
assert np.array_equal(result3, res_array[2])
|
||||||
|
assert np.array_equal(result4, res_array[3])
|
||||||
|
assert np.array_equal(result5, res_array[4])
|
||||||
|
assert np.array_equal(result6, res_array[5])
|
||||||
|
assert np.array_equal(result7, res_array[6])
|
||||||
|
|
||||||
|
|
||||||
|
def test_fast_text_all_to_vectors_params_eager():
|
||||||
|
"""
|
||||||
|
Feature: FastText
|
||||||
|
Description: test with all parameters which include `unk_init` and `lower_case_backup` in function ToVectors
|
||||||
|
in eager mode
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec", max_vectors=4)
|
||||||
|
my_unk = [-1, -1, -1, -1, -1, -1]
|
||||||
|
to_vectors = T.ToVectors(vectors, unk_init=my_unk, lower_case_backup=True)
|
||||||
|
result1 = to_vectors("Ok")
|
||||||
|
result2 = to_vectors("!")
|
||||||
|
result3 = to_vectors("This")
|
||||||
|
result4 = to_vectors("is")
|
||||||
|
result5 = to_vectors("my")
|
||||||
|
result6 = to_vectors("home")
|
||||||
|
result7 = to_vectors("none")
|
||||||
|
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
|
||||||
|
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
|
||||||
|
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
|
||||||
|
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
|
||||||
|
[-1, -1, -1, -1, -1, -1],
|
||||||
|
[-1, -1, -1, -1, -1, -1],
|
||||||
|
[-1, -1, -1, -1, -1, -1]]
|
||||||
|
res_array = np.array(res, dtype=np.float32)
|
||||||
|
|
||||||
|
assert np.array_equal(result1, res_array[0])
|
||||||
|
assert np.array_equal(result2, res_array[1])
|
||||||
|
assert np.array_equal(result3, res_array[2])
|
||||||
|
assert np.array_equal(result4, res_array[3])
|
||||||
|
assert np.array_equal(result5, res_array[4])
|
||||||
|
assert np.array_equal(result6, res_array[5])
|
||||||
|
assert np.array_equal(result7, res_array[6])
|
||||||
|
|
||||||
|
|
||||||
|
def test_fast_text_build_from_file():
|
||||||
|
"""
|
||||||
|
Feature: FastText
|
||||||
|
Description: test with only default parameter
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec")
|
||||||
|
to_vectors = text.ToVectors(vectors)
|
||||||
|
data = ds.TextFileDataset(DATASET_ROOT_PATH + "words.txt", shuffle=False)
|
||||||
|
data = data.map(operations=to_vectors, input_columns=["text"])
|
||||||
|
ind = 0
|
||||||
|
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
|
||||||
|
[0, 0, 0, 0, 0, 0],
|
||||||
|
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
|
||||||
|
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
|
||||||
|
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
|
||||||
|
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
|
||||||
|
[0, 0, 0, 0, 0, 0]]
|
||||||
|
print(data)
|
||||||
|
for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
|
res_array = np.array(res[ind], dtype=np.float32)
|
||||||
|
assert np.array_equal(res_array, d["text"]), ind
|
||||||
|
ind += 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_fast_text_build_from_file_eager():
|
||||||
|
"""
|
||||||
|
Feature: FastText
|
||||||
|
Description: test with only default parameter in eager mode
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vectors = text.FastText.from_file(DATASET_ROOT_PATH + "fast_text.vec")
|
||||||
|
to_vectors = T.ToVectors(vectors)
|
||||||
|
result1 = to_vectors("ok")
|
||||||
|
result2 = to_vectors("!")
|
||||||
|
result3 = to_vectors("this")
|
||||||
|
result4 = to_vectors("is")
|
||||||
|
result5 = to_vectors("my")
|
||||||
|
result6 = to_vectors("home")
|
||||||
|
result7 = to_vectors("none")
|
||||||
|
res = [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411],
|
||||||
|
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709],
|
||||||
|
[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973],
|
||||||
|
[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603],
|
||||||
|
[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246],
|
||||||
|
[0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923],
|
||||||
|
[0, 0, 0, 0, 0, 0]]
|
||||||
|
res_array = np.array(res, dtype=np.float32)
|
||||||
|
|
||||||
|
assert np.array_equal(result1, res_array[0])
|
||||||
|
assert np.array_equal(result2, res_array[1])
|
||||||
|
assert np.array_equal(result3, res_array[2])
|
||||||
|
assert np.array_equal(result4, res_array[3])
|
||||||
|
assert np.array_equal(result5, res_array[4])
|
||||||
|
assert np.array_equal(result6, res_array[5])
|
||||||
|
assert np.array_equal(result7, res_array[6])
|
||||||
|
|
||||||
|
|
||||||
|
def test_fast_text_invalid_input():
|
||||||
|
"""
|
||||||
|
Feature: FastText
|
||||||
|
Description: test the validate function with invalid parameters
|
||||||
|
Expectation: output is equal to the expected error
|
||||||
|
"""
|
||||||
|
def test_invalid_input(test_name, file_path, error, error_msg, max_vectors=None, unk_init=None,
|
||||||
|
lower_case_backup=False, token="ok"):
|
||||||
|
log.info("Test Vectors with wrong input: {0}".format(test_name))
|
||||||
|
with pytest.raises(error) as error_info:
|
||||||
|
vectors = text.FastText.from_file(file_path, max_vectors=max_vectors)
|
||||||
|
to_vectors = T.ToVectors(vectors, unk_init=unk_init, lower_case_backup=lower_case_backup)
|
||||||
|
to_vectors(token)
|
||||||
|
assert error_msg in str(error_info.value)
|
||||||
|
|
||||||
|
test_invalid_input("Not all vectors have the same number of dimensions",
|
||||||
|
DATASET_ROOT_PATH + "fast_text_dim_different.vec", error=RuntimeError,
|
||||||
|
error_msg="all vectors must have the same number of dimensions, " \
|
||||||
|
"but got dim 5 while expecting 6")
|
||||||
|
test_invalid_input("the file is empty.", DATASET_ROOT_PATH + "fast_text_empty.vec",
|
||||||
|
error=RuntimeError, error_msg="invalid file, file is empty.")
|
||||||
|
test_invalid_input("the count of `unknown_init`'s element is different with word vector.",
|
||||||
|
DATASET_ROOT_PATH + "fast_text.vec",
|
||||||
|
error=RuntimeError,
|
||||||
|
error_msg="unk_init must be the same length as vectors, but got unk_init",
|
||||||
|
unk_init=[-1, -1])
|
||||||
|
test_invalid_input("The file not exist", DATASET_ROOT_PATH + "not_exist.vec", RuntimeError,
|
||||||
|
error_msg="FastText: invalid file")
|
||||||
|
test_invalid_input("The token is 1-dimensional", DATASET_ROOT_PATH + "fast_text_with_wrong_info.vec",
|
||||||
|
error=RuntimeError, error_msg="token with 1-dimensional vector.")
|
||||||
|
test_invalid_input("max_vectors parameter must be greater than 0", DATASET_ROOT_PATH + "fast_text.vec",
|
||||||
|
error=ValueError, error_msg="Input max_vectors is not within the required interval",
|
||||||
|
max_vectors=-1)
|
||||||
|
test_invalid_input("invalid max_vectors parameter type as a float", DATASET_ROOT_PATH + "fast_text.vec",
|
||||||
|
error=TypeError, error_msg="Argument max_vectors with value 1.0 is not of type [<class 'int'>],"
|
||||||
|
" but got <class 'float'>.", max_vectors=1.0)
|
||||||
|
test_invalid_input("invalid max_vectors parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
|
||||||
|
error=TypeError, error_msg="Argument max_vectors with value 1 is not of type [<class 'int'>],"
|
||||||
|
" but got <class 'str'>.", max_vectors="1")
|
||||||
|
test_invalid_input("invalid token parameter type as a float", DATASET_ROOT_PATH + "fast_text.vec",
|
||||||
|
error=RuntimeError, error_msg="input tensor type should be string.", token=1.0)
|
||||||
|
test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
|
||||||
|
error=TypeError, error_msg="Argument lower_case_backup with value True is " \
|
||||||
|
"not of type [<class 'bool'>],"
|
||||||
|
" but got <class 'str'>.", lower_case_backup="True")
|
||||||
|
test_invalid_input("invalid lower_case_backup parameter type as a string", DATASET_ROOT_PATH + "fast_text.vec",
|
||||||
|
error=TypeError, error_msg="Argument lower_case_backup with value True is " \
|
||||||
|
"not of type [<class 'bool'>],"
|
||||||
|
" but got <class 'str'>.", lower_case_backup="True")
|
||||||
|
test_invalid_input("the suffix of pre-training set must be `*.vec`", DATASET_ROOT_PATH + "fast_text.txt",
|
||||||
|
error=RuntimeError, error_msg="FastText: invalid file, can not find file '*.vec'")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
test_fast_text_all_build_from_file_params()
|
||||||
|
test_fast_text_all_build_from_file_params_eager()
|
||||||
|
test_fast_text_all_to_vectors_params_eager()
|
||||||
|
test_fast_text_build_from_file()
|
||||||
|
test_fast_text_build_from_file_eager()
|
||||||
|
test_fast_text_invalid_input()
|
Loading…
Reference in New Issue