!33475 Remove test data containing sensitive word

Merge pull request !33475 from xiaotianci/fix_examples
commit fd22d69ca7
i-robot 2022-04-28 02:05:09 +00:00, committed by Gitee
11 changed files with 203 additions and 34842 deletions


@@ -192,7 +192,7 @@ class SentencePieceVocab {
  /// \par Example
  /// \code
  /// std::string dataset_path;
-  /// dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  /// dataset_path = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  /// std::vector<std::string> path_list;
  /// path_list.emplace_back(dataset_path);
  /// std::unordered_map<std::string, std::string> param_map;


@@ -38,13 +38,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess1 plus sentencepiece tokenizer.";

  // Create a TextFile dataset
-  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds_vocab, nullptr);

  // Create vocab from dataset
  std::shared_ptr<SentencePieceVocab> vocab =
-    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
+    ds_vocab->BuildSentencePieceVocab({}, 100, 0.9995, SentencePieceModel::kUnigram, {});
  EXPECT_NE(vocab, nullptr);

  // Create a TextFile dataset
@@ -70,13 +70,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected result after tokenization
-  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
+  std::vector<std::string> expected = {"", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r",
+                                       "a", "i", "n", "", "m", "y", "▁model", "."};
  std::shared_ptr<Tensor> de_expected_tensor;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
-  mindspore::MSTensor expected_tensor =
-    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
-  uint64_t i = 0;
+  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
+  uint32_t i = 0;

  while (row.size() != 0) {
    auto txt = row["text"];
    TEST_MS_LOG_MSTENSOR(INFO, "txt: ", txt);
@@ -97,13 +97,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess2 plus sentencepiece tokenizer.";

  // Create a TextFile dataset
-  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds_vocab, nullptr);

  // Create vocab from dataset
  std::shared_ptr<SentencePieceVocab> vocab =
-    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
+    ds_vocab->BuildSentencePieceVocab({}, 100, 0.9995, SentencePieceModel::kUnigram, {});
  EXPECT_NE(vocab, nullptr);

  // Save vocab model to local
@@ -133,13 +133,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected result after tokenization
-  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
+  std::vector<std::string> expected = {"", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r",
+                                       "a", "i", "n", "", "m", "y", "▁model", "."};
  std::shared_ptr<Tensor> de_expected_tensor;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
-  mindspore::MSTensor expected_tensor =
-    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
-  uint64_t i = 0;
+  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
+  uint32_t i = 0;

  while (row.size() != 0) {
    auto txt = row["text"];
    TEST_MS_LOG_MSTENSOR(INFO, "txt: ", txt);
@@ -160,7 +160,7 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabFail1 with incorrect parameter.";

  // Create a TextFile dataset
-  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds_vocab, nullptr);


@@ -18,11 +18,9 @@
#include "minddata/dataset/core/de_tensor.h"
#include "minddata/dataset/include/dataset/audio.h"
#include "minddata/dataset/include/dataset/execute.h"
-#include "minddata/dataset/include/dataset/transforms.h"
-#include "minddata/dataset/include/dataset/audio.h"
-#include "minddata/dataset/include/dataset/vision.h"
-#include "minddata/dataset/include/dataset/audio.h"
#include "minddata/dataset/include/dataset/text.h"
+#include "minddata/dataset/include/dataset/transforms.h"
+#include "minddata/dataset/include/dataset/vision.h"
#include "minddata/dataset/text/char_n_gram.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/glove.h"


@@ -16,6 +16,7 @@
#include <string>
#include <string_view>
#include "common/common.h"
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
@@ -60,11 +61,11 @@ TEST_F(MindDataTestSentencePieceVocabOp, TestSentencePieceFromFileFuntions) {
  MS_LOG(INFO) << "Doing MindDataTestSentencePieceVocabOp TestSentencePieceFromFileFuntions.";

  std::string dataset_path;
-  dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  dataset_path = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  std::vector<std::string> path_list;
  path_list.emplace_back(dataset_path);
  std::unordered_map<std::string, std::string> param_map;
  std::shared_ptr<SentencePieceVocab> spm = std::make_unique<SentencePieceVocab>();
-  Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995, SentencePieceModel::kUnigram, param_map, &spm);
+  Status rc = SentencePieceVocab::BuildFromFile(path_list, 100, 0.9995, SentencePieceModel::kUnigram, param_map, &spm);
  ASSERT_TRUE(rc.IsOk());
}


@@ -1,3 +1,3 @@
-{"label": "102", "label_desc": "news_entertainment", "sentence": "新闻1", "keywords": "关键词一,关键词二,关键词三,关键词四"}
-{"label": "110", "label_desc": "news_military", "sentence": "新闻2", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
-{"label": "104", "label_desc": "news_finance", "sentence": "新闻3", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
+{"label": "102", "label_desc": "音乐", "sentence": "新闻1", "keywords": "关键词一,关键词二,关键词三,关键词四"}
+{"label": "110", "label_desc": "电影", "sentence": "新闻2", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
+{"label": "104", "label_desc": "科技", "sentence": "新闻3", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}


@@ -1 +1 @@
-I saw a girl with a telescope.
+I use MindSpore to train my model.

File diff suppressed because it is too large.


@@ -0,0 +1,89 @@
Overall Architecture
MindSpore is a deep learning framework in all scenarios, aiming to achieve easy development, efficient execution, and
all-scenario coverage. Easy development features include API friendliness and low debugging difficulty. Efficient
execution includes computing efficiency, data preprocessing efficiency, and distributed training efficiency.
All-scenario coverage means that the framework supports cloud, edge, and device scenarios.
ME (MindExpression) provides user-level APIs for scientific computing, building and training neural networks, and
converting Python code of users into graphs. For more information about the overall architecture, see Overall
Architecture.
Design Concept
MindSpore originates from the best practices of the entire industry and provides unified model training, inference, and
export APIs for data scientists and algorithm engineers. It supports flexible deployment in different scenarios such as
the device, edge, and cloud, and promotes the prosperity of domains such as deep learning and scientific computing.
MindSpore provides the Python programming paradigm. Users can use the native control logic of Python to build complex
neural network models, simplifying AI programming. For details, see Quick Start for Beginners.
Currently, there are two execution modes of a mainstream deep learning framework: a static graph mode and a dynamic
graph mode. The static graph mode has a relatively high training performance, but is difficult to debug. On the
contrary, the dynamic graph mode is easy to debug, but is difficult to execute efficiently. MindSpore provides an
encoding mode that unifies dynamic and static graphs, which greatly improves the compatibility between static and
dynamic graphs. Instead of developing multiple sets of code, users can switch between the dynamic and static graph
modes by changing only one line of code. For example, set context.set_context(mode=context.PYNATIVE_MODE) to switch to
the dynamic graph mode, or set context.set_context(mode=context.GRAPH_MODE) to switch to the static graph mode, which
facilitates development and debugging, and improves performance experience.
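For illustration, a minimal sketch of this one-line switch (the tensors below are placeholder values, not taken from this change; only the set_context line differs between the two modes):

    import numpy as np
    from mindspore import Tensor, context

    # Change only this line to switch execution modes.
    context.set_context(mode=context.PYNATIVE_MODE)   # dynamic graph mode
    # context.set_context(mode=context.GRAPH_MODE)    # static graph mode

    x = Tensor(np.ones((2, 2), np.float32))
    y = Tensor(np.ones((2, 2), np.float32))
    print(x + y)   # the same script runs unchanged in either mode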
A neural network model is usually trained based on gradient descent algorithm, but the manual derivation process is
complex and the result is prone to errors. The automatic differentiation mechanism of MindSpore based on source code
transformation (SCT) uses a functional differential programming architecture and provides Python APIs at the API layer,
including the expression of control flows. Users can focus on the native mathematical expression of the model algorithm
without manual derivation. The sample code for automatic differentiation is as follows:
In the first step, a function (computational graph) is defined. In the second step, automatic differentiation is
performed by using a backward API provided by MindSpore, and the first derivative function (computational graph) is
defined. In the third step, the second derivative function (computational graph) is defined. After the input is given,
the second derivative of the function defined in step 1 can be obtained at the specified position. The result of the
second derivative is 12.
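A minimal sketch of the three steps described above, assuming the function is f(x) = x**3 and the input is 2, which yields the stated second derivative of 12 (the exact function used in the original sample is an assumption here):

    import numpy as np
    import mindspore.nn as nn
    import mindspore.ops as ops
    from mindspore import Tensor

    class Net(nn.Cell):                      # step 1: define the function f(x) = x**3
        def construct(self, x):
            return x ** 3

    class Grad(nn.Cell):                     # wrap a network with GradOperation to differentiate it
        def __init__(self, net):
            super().__init__()
            self.grad = ops.GradOperation()
            self.net = net

        def construct(self, x):
            return self.grad(self.net)(x)

    first_grad = Grad(Net())                 # step 2: first derivative, 3 * x**2
    second_grad = Grad(first_grad)           # step 3: second derivative, 6 * x
    print(second_grad(Tensor(np.array([2.0], np.float32))))   # [12.]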
In addition, the SCT can convert Python code into an intermediate representation (IR) of a MindSpore function. The IR
constructs a computational graph that can be parsed and executed on different devices. Before the computational graph
is executed, a plurality of software and hardware collaborative optimization technologies are used, and performance and
efficiency in different scenarios such as device, edge, and cloud, are improved.
Improving the data processing capability to match the computing power of AI chips is the key to ensure the ultimate
performance of AI chips. MindSpore provides multiple data processing operators and uses automatic data acceleration
technology to implement high-performance pipelines, including data loading, data demonstration, and data conversion. It
supports data processing capabilities in all scenarios, such as CV, NLP, and GNN. MindRecord is a self-developed data
format of MindSpore. It features efficient read and write and easy distributed processing. Users can convert
non-standard and common datasets to the MindRecord format to obtain better performance experience. For details about
the conversion, see MindSpore Data Format Conversion. MindSpore supports the loading of common datasets and datasets in
multiple data storage formats. For example, users can use dataset=dataset.Cifar10Dataset("Cifar10Data/") to load the
CIFAR-10 dataset. Cifar10Data/ indicates the local directory of the dataset, and users can also use GeneratorDataset to
customize the dataset loading mode. Data augmentation is a method of generating new data based on (limited) data, which
can reduce the overfitting phenomenon of network model and improve the generalization ability of the model. In addition
to user-defined data augmentation, MindSpore provides automatic data augmentation, making data augmentation more
flexible. For details, see Automatic Data Augmentation.
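For example, a minimal sketch of the two loading styles mentioned above ("Cifar10Data/" must point to a local copy of the CIFAR-10 dataset, and the generator data is made up):

    import numpy as np
    import mindspore.dataset as ds

    # Load a standard dataset from a local directory.
    cifar10 = ds.Cifar10Dataset("Cifar10Data/", num_samples=4)

    # Or customize the loading mode with GeneratorDataset.
    def my_source():
        for i in range(4):
            yield (np.full((2, 2), i, dtype=np.float32), np.array(i, dtype=np.int32))

    custom = ds.GeneratorDataset(my_source, column_names=["data", "label"])
    for row in custom.create_dict_iterator(num_epochs=1, output_numpy=True):
        print(row["label"])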
The deep learning neural network model usually contains many hidden layers for feature extraction. However, the feature
extraction is random and the debugging process is invisible, which limits the trustworthiness and optimization of the
deep learning technology. MindSpore supports visualized debugging and optimization (MindInsight) and provides functions
such as training dashboard, lineage, performance analysis, and debugger to help users detect deviations during model
training and easily debug and optimize models. For example, before initializing the network, users can use
profiler=Profiler() to initialize the Profiler object, automatically collect information such as the operator time
consumption during training, and record the information in a file. After the training is complete, call
profiler.analyse() to stop collecting data and generate performance analysis results. Users can view and analyze the
visualized results to more efficiently debug network performance. For details about debugging and optimization, see
Training Process Visualization.
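For example, a minimal sketch of that profiling flow, assuming a device target on which profiling is supported; the matrix multiplication is only a placeholder for real training steps:

    import numpy as np
    import mindspore.ops as ops
    from mindspore import Tensor
    from mindspore.profiler import Profiler

    profiler = Profiler(output_path="./profiler_data")   # start collecting before the network runs

    x = Tensor(np.ones((16, 16), np.float32))
    y = ops.MatMul()(x, x)                                # placeholder computation

    profiler.analyse()                                    # stop collecting and generate the results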
As the scale of neural network models and datasets continuously increases, parallel distributed training becomes a common
practice of neural network training. However, policy selection and compilation of parallel distributed training are
very complex, which severely restricts training efficiency of a deep learning model and hinders development of deep
learning. MindSpore unifies the encoding methods of standalone and distributed training. Developers do not need to
write complex distributed policies. Instead, they can implement distributed training by adding a small amount of codes
to the standalone code. For example, after context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL)
is set, a cost model can be automatically established, and a better parallel mode can be selected for users. This
improves the training efficiency of neural networks, greatly decreases the AI development difficulty, and enables users
to quickly implement models. For more information, see Distributed Training.
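For example, a minimal sketch of turning a standalone script into an auto-parallel one (init() assumes the job was launched in a correctly configured multi-device environment):

    from mindspore import context
    from mindspore.communication import init
    from mindspore.context import ParallelMode

    init()   # initialize the communication backend for the launched devices
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL)
    # ...the rest of the standalone training code stays as it is...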
Level Structure
To support network building, entire graph execution, subgraph execution, and single-operator execution, MindSpore
provides users with three levels of APIs which are Low-Level Python API, Medium-Level Python API, and High-Level Python
API in ascending order.
Low-Level Python API
The first is low-level API, including tensor definition, basic operators, and automatic differential modules. Users can
use the low-level API to easily define tensors and perform derivative calculation. For example, users can customize
tensors by using the Tensor API, and use the GradOperation operator in the ops.composite module to calculate the
derivative of the function at a specified position.
Medium-Level Python API
The second is the medium-level API, which encapsulates the low-level APIs and provides modules such as the network layer,
optimizer, and loss function. Users can flexibly build neural networks and control execution processes through the
medium-level API to quickly implement model algorithm logic. For example, users can call the Cell API to build neural
network models and computing logic, add the loss function and optimization methods to the neural network model by using
the loss module and Optimizer API, and use the dataset module to process data for model training and evaluation.
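For example, a small sketch of assembling a network, loss function, and optimizer from these medium-level modules (the layer sizes are arbitrary):

    import mindspore.nn as nn

    class SimpleNet(nn.Cell):
        """A toy two-layer network built from nn layers."""
        def __init__(self):
            super().__init__()
            self.fc1 = nn.Dense(32, 16)
            self.relu = nn.ReLU()
            self.fc2 = nn.Dense(16, 10)

        def construct(self, x):
            return self.fc2(self.relu(self.fc1(x)))

    net = SimpleNet()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)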
High-Level Python API
The third is high-level API. Based on the medium-level API, it provides advanced APIs such as training and inference
management, mixed precision training, and debugging and optimization, facilitating users to control the execution
process of the entire network and implement training, inference, and optimization of the neural network. For example,
users can use the Model API, specify the neural network model to be trained and related training settings, train the
neural network model, and debug the neural network performance through the Profiler API.
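For example, a sketch of driving one epoch of training through the Model API, with a made-up random dataset and a trivial stand-in network (all names and sizes here are illustrative assumptions):

    import numpy as np
    import mindspore.dataset as ds
    import mindspore.nn as nn
    from mindspore import Model

    def make_data():
        for _ in range(100):
            yield (np.random.rand(32).astype(np.float32),
                   np.array(np.random.randint(0, 10), dtype=np.int32))

    train_dataset = ds.GeneratorDataset(make_data, column_names=["data", "label"]).batch(10)

    net = nn.Dense(32, 10)   # trivial stand-in network
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

    model = Model(net, loss_fn=loss_fn, optimizer=optimizer, metrics={"accuracy"})
    model.train(1, train_dataset)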


@@ -1 +1 @@
-Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they'll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it's like to be homeless? That is Goddard Bolt's lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days without the luxuries; if Bolt succeeds, he can do what he wants with a future project of making more buildings. The bet's on where Bolt is thrown on the street with a bracelet on his leg to monitor his every move where he can't step off the sidewalk. He's given the nickname Pepto by a vagrant after it's written on his forehead where Bolt meets other characters including a woman by the name of Molly (Lesley Ann Warren) an ex-dancer who got divorce before losing her home, and her pals Sailor (Howard Morris) and Fumes (Teddy Wilson) who are already used to the streets. They're survivors. Bolt isn't. He's not used to reaching mutual agreements like he once did when being rich where it's fight or flight, kill or be killed.<br /><br />While the love connection between Molly and Bolt wasn't necessary to plot, I found "Life Stinks" to be one of Mel Brooks' observant films where prior to being a comedy, it shows a tender side compared to his slapstick work such as Blazing Saddles, Young Frankenstein, or Spaceballs for the matter, to show what it's like having something valuable before losing it the next day or on the other hand making a stupid bet like all rich people do when they don't know what to do with their money. Maybe they should give it to the homeless instead of using it like Monopoly money.<br /><br />Or maybe this film will inspire you to help others.
+MindSpore is a deep learning framework in all scenarios, aiming to achieve easy development, efficient execution, and all-scenario coverage.<br /><br />Easy development features include API friendliness and low debugging difficulty.<br /><br />Efficient execution includes computing efficiency, data preprocessing efficiency, and distributed training efficiency.<br /><br />All-scenario coverage means that the framework supports cloud, edge, and device scenarios.<br /><br />ME (MindExpression) provides user-level APIs for scientific computing, building and training neural networks, and converting Python code of users into graphs.<br /><br />For more information about the overall architecture, see Overall Architecture.

File diff suppressed because it is too large.


@@ -18,60 +18,86 @@ import mindspore.dataset.text as text
import mindspore.dataset as ds
from mindspore.dataset.text import SentencePieceModel, to_str, SPieceTokenizerOutType

-VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
+VOCAB_FILE = "../data/dataset/test_sentencepiece/vocab.txt"
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"


def test_sentence_piece_tokenizer_callable():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with eager mode
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
-    data = '123'
-    assert np.array_equal(tokenizer(data), ['', '12', '3'])
+    data = "123"
+    assert np.array_equal(tokenizer(data), ["", "1", "23"])


-def test_from_vocab_to_str_UNIGRAM():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+def test_from_vocab_to_str_unigram():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with UNIGRAM model
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


-def test_from_vocab_to_str_BPE():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.BPE, {})
+def test_from_vocab_to_str_bpe():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with BPE model
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.BPE, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'c', 'ope', '.']
+    expect = ["", "I", "", "u", "s", "e", "", "M", "in", "d", "S", "p", "or", "e", "▁t", "o", "▁t", "ra", "in", "▁m",
+              "y", "▁m", "ode", "l", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


-def test_from_vocab_to_str_CHAR():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
+def test_from_vocab_to_str_char():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with CHAR model
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.CHAR, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['', 'I', '', 's', 'a', 'w', '', 'a', '', 'g', 'i', 'r', 'l', '', 'w', 'i', 't', 'h',\
-              '', 'a', '', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.']
+    expect = ["", "I", "", "u", "s", "e", "", "M", "i", "n", "d", "S", "p", "o", "r", "e", "", "t", "o", "", "t",
+              "r", "a", "i", "n", "", "m", "y", "", "m", "o", "d", "e", "l", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


-def test_from_vocab_to_str_WORD():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
+def test_from_vocab_to_str_word():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with WORD model
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.WORD, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
+    expect = ["▁I", "▁use", "▁MindSpore", "▁to", "▁train▁my▁model."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):

@@ -79,11 +105,16 @@ def test_from_vocab_to_str_WORD():

def test_from_vocab_to_int():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with out_type equal to int
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
+    expect = [3, 41, 59, 53, 3, 29, 3, 6, 12, 99, 7, 10, 3, 11, 20, 45, 19]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = i["text"]
        for key, value in enumerate(ret):

@@ -91,12 +122,17 @@ def test_from_vocab_to_int():

def test_from_file_to_str():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with out_type equal to string
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, "./", "m.model")
    tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):

@@ -104,12 +140,17 @@ def test_from_file_to_str():

def test_from_file_to_int():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer while loading vocab model from file
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, "./", "m.model")
    tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.INT)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
+    expect = [3, 41, 59, 53, 3, 29, 3, 6, 12, 99, 7, 10, 3, 11, 20, 45, 19]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = i["text"]
        for key, value in enumerate(ret):

@@ -117,12 +158,17 @@ def test_from_file_to_int():

def test_build_from_dataset():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer while loading vocab model from dataset
+    Expectation: output is equal to the expected value
+    """
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
-    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):

@@ -130,8 +176,8 @@ def test_build_from_dataset():

def apply_func(dataset):
-    input_columns = ['text']
-    output_columns = ['text2']
+    input_columns = ["text"]
+    output_columns = ["text2"]
    dataset = dataset.rename(input_columns, output_columns)
    return dataset

@@ -141,7 +187,7 @@ def zip_test(dataset):
    dataset_2 = copy.deepcopy(dataset)
    dataset_1 = dataset_1.apply(apply_func)
    dataset_zip = ds.zip((dataset_1, dataset_2))
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset_zip.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):

@@ -151,15 +197,21 @@ def zip_test(dataset):

def concat_test(dataset):
    dataset_1 = copy.deepcopy(dataset)
    dataset = dataset.concat(dataset_1)
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_with_zip_concat():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with zip and concat operations
+    Expectation: output is equal to the expected value
+    """
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
-    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer, num_parallel_workers=2)

@@ -169,10 +221,10 @@ def test_with_zip_concat():

if __name__ == "__main__":
    test_sentence_piece_tokenizer_callable()
-    test_from_vocab_to_str_UNIGRAM()
-    test_from_vocab_to_str_BPE()
-    test_from_vocab_to_str_CHAR()
-    test_from_vocab_to_str_WORD()
+    test_from_vocab_to_str_unigram()
+    test_from_vocab_to_str_bpe()
+    test_from_vocab_to_str_char()
+    test_from_vocab_to_str_word()
    test_from_vocab_to_int()
    test_from_file_to_str()
    test_from_file_to_int()