!33475 Remove test data containing sensitive word
Merge pull request !33475 from xiaotianci/fix_examples
This commit is contained in:
commit
fd22d69ca7
|
@ -192,7 +192,7 @@ class SentencePieceVocab {
|
||||||
/// \par Example
|
/// \par Example
|
||||||
/// \code
|
/// \code
|
||||||
/// std::string dataset_path;
|
/// std::string dataset_path;
|
||||||
/// dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
|
/// dataset_path = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
|
||||||
/// std::vector<std::string> path_list;
|
/// std::vector<std::string> path_list;
|
||||||
/// path_list.emplace_back(dataset_path);
|
/// path_list.emplace_back(dataset_path);
|
||||||
/// std::unordered_map<std::string, std::string> param_map;
|
/// std::unordered_map<std::string, std::string> param_map;
|
||||||
|
|
|
@ -38,13 +38,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
|
||||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess1 plus sentencepiece tokenizer.";
|
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess1 plus sentencepiece tokenizer.";
|
||||||
|
|
||||||
// Create a TextFile dataset
|
// Create a TextFile dataset
|
||||||
std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
|
std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
|
||||||
std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
|
std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
|
||||||
EXPECT_NE(ds_vocab, nullptr);
|
EXPECT_NE(ds_vocab, nullptr);
|
||||||
|
|
||||||
// Create vocab from dataset
|
// Create vocab from dataset
|
||||||
std::shared_ptr<SentencePieceVocab> vocab =
|
std::shared_ptr<SentencePieceVocab> vocab =
|
||||||
ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
|
ds_vocab->BuildSentencePieceVocab({}, 100, 0.9995, SentencePieceModel::kUnigram, {});
|
||||||
EXPECT_NE(vocab, nullptr);
|
EXPECT_NE(vocab, nullptr);
|
||||||
|
|
||||||
// Create a TextFile dataset
|
// Create a TextFile dataset
|
||||||
|
@ -70,13 +70,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
|
||||||
ASSERT_OK(iter->GetNextRow(&row));
|
ASSERT_OK(iter->GetNextRow(&row));
|
||||||
|
|
||||||
// Expected result after tokenization
|
// Expected result after tokenization
|
||||||
std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
|
std::vector<std::string> expected = {"▁", "I", "▁use", "▁MindSpore", "▁", "to", "▁", "t", "r",
|
||||||
|
"a", "i", "n", "▁", "m", "y", "▁model", "."};
|
||||||
std::shared_ptr<Tensor> de_expected_tensor;
|
std::shared_ptr<Tensor> de_expected_tensor;
|
||||||
ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
|
ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
|
||||||
mindspore::MSTensor expected_tensor =
|
auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
|
||||||
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
|
|
||||||
|
|
||||||
uint64_t i = 0;
|
uint32_t i = 0;
|
||||||
while (row.size() != 0) {
|
while (row.size() != 0) {
|
||||||
auto txt = row["text"];
|
auto txt = row["text"];
|
||||||
TEST_MS_LOG_MSTENSOR(INFO, "txt: ", txt);
|
TEST_MS_LOG_MSTENSOR(INFO, "txt: ", txt);
|
||||||
|
@ -97,13 +97,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
|
||||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess2 plus sentencepiece tokenizer.";
|
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess2 plus sentencepiece tokenizer.";
|
||||||
|
|
||||||
// Create a TextFile dataset
|
// Create a TextFile dataset
|
||||||
std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
|
std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
|
||||||
std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
|
std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
|
||||||
EXPECT_NE(ds_vocab, nullptr);
|
EXPECT_NE(ds_vocab, nullptr);
|
||||||
|
|
||||||
// Create vocab from dataset
|
// Create vocab from dataset
|
||||||
std::shared_ptr<SentencePieceVocab> vocab =
|
std::shared_ptr<SentencePieceVocab> vocab =
|
||||||
ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
|
ds_vocab->BuildSentencePieceVocab({}, 100, 0.9995, SentencePieceModel::kUnigram, {});
|
||||||
EXPECT_NE(vocab, nullptr);
|
EXPECT_NE(vocab, nullptr);
|
||||||
|
|
||||||
// Save vocab model to local
|
// Save vocab model to local
|
||||||
|
@ -133,13 +133,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
|
||||||
ASSERT_OK(iter->GetNextRow(&row));
|
ASSERT_OK(iter->GetNextRow(&row));
|
||||||
|
|
||||||
// Expected result after tokenization
|
// Expected result after tokenization
|
||||||
std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
|
std::vector<std::string> expected = {"▁", "I", "▁use", "▁MindSpore", "▁", "to", "▁", "t", "r",
|
||||||
|
"a", "i", "n", "▁", "m", "y", "▁model", "."};
|
||||||
std::shared_ptr<Tensor> de_expected_tensor;
|
std::shared_ptr<Tensor> de_expected_tensor;
|
||||||
ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
|
ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
|
||||||
mindspore::MSTensor expected_tensor =
|
auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
|
||||||
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
|
|
||||||
|
|
||||||
uint64_t i = 0;
|
uint32_t i = 0;
|
||||||
while (row.size() != 0) {
|
while (row.size() != 0) {
|
||||||
auto txt = row["text"];
|
auto txt = row["text"];
|
||||||
TEST_MS_LOG_MSTENSOR(INFO, "txt: ", txt);
|
TEST_MS_LOG_MSTENSOR(INFO, "txt: ", txt);
|
||||||
|
@ -160,7 +160,7 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) {
|
||||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabFail1 with incorrect parameter.";
|
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabFail1 with incorrect parameter.";
|
||||||
|
|
||||||
// Create a TextFile dataset
|
// Create a TextFile dataset
|
||||||
std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
|
std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
|
||||||
std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
|
std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
|
||||||
EXPECT_NE(ds_vocab, nullptr);
|
EXPECT_NE(ds_vocab, nullptr);
|
||||||
|
|
||||||
|
|
|
@ -18,11 +18,9 @@
|
||||||
#include "minddata/dataset/core/de_tensor.h"
|
#include "minddata/dataset/core/de_tensor.h"
|
||||||
#include "minddata/dataset/include/dataset/audio.h"
|
#include "minddata/dataset/include/dataset/audio.h"
|
||||||
#include "minddata/dataset/include/dataset/execute.h"
|
#include "minddata/dataset/include/dataset/execute.h"
|
||||||
#include "minddata/dataset/include/dataset/transforms.h"
|
|
||||||
#include "minddata/dataset/include/dataset/audio.h"
|
|
||||||
#include "minddata/dataset/include/dataset/vision.h"
|
|
||||||
#include "minddata/dataset/include/dataset/audio.h"
|
|
||||||
#include "minddata/dataset/include/dataset/text.h"
|
#include "minddata/dataset/include/dataset/text.h"
|
||||||
|
#include "minddata/dataset/include/dataset/transforms.h"
|
||||||
|
#include "minddata/dataset/include/dataset/vision.h"
|
||||||
#include "minddata/dataset/text/char_n_gram.h"
|
#include "minddata/dataset/text/char_n_gram.h"
|
||||||
#include "minddata/dataset/text/fast_text.h"
|
#include "minddata/dataset/text/fast_text.h"
|
||||||
#include "minddata/dataset/text/glove.h"
|
#include "minddata/dataset/text/glove.h"
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
|
|
||||||
#include "common/common.h"
|
#include "common/common.h"
|
||||||
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
|
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
|
||||||
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
|
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
|
||||||
|
@ -60,11 +61,11 @@ TEST_F(MindDataTestSentencePieceVocabOp, TestSentencePieceFromFileFuntions) {
|
||||||
MS_LOG(INFO) << "Doing MindDataTestSentencePieceVocabOp TestSentencePieceFromFileFuntions.";
|
MS_LOG(INFO) << "Doing MindDataTestSentencePieceVocabOp TestSentencePieceFromFileFuntions.";
|
||||||
|
|
||||||
std::string dataset_path;
|
std::string dataset_path;
|
||||||
dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
|
dataset_path = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
|
||||||
std::vector<std::string> path_list;
|
std::vector<std::string> path_list;
|
||||||
path_list.emplace_back(dataset_path);
|
path_list.emplace_back(dataset_path);
|
||||||
std::unordered_map<std::string, std::string> param_map;
|
std::unordered_map<std::string, std::string> param_map;
|
||||||
std::shared_ptr<SentencePieceVocab> spm = std::make_unique<SentencePieceVocab>();
|
std::shared_ptr<SentencePieceVocab> spm = std::make_unique<SentencePieceVocab>();
|
||||||
Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995, SentencePieceModel::kUnigram, param_map, &spm);
|
Status rc = SentencePieceVocab::BuildFromFile(path_list, 100, 0.9995, SentencePieceModel::kUnigram, param_map, &spm);
|
||||||
ASSERT_TRUE(rc.IsOk());
|
ASSERT_TRUE(rc.IsOk());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
{"label": "102", "label_desc": "news_entertainment", "sentence": "新闻1", "keywords": "关键词一,关键词二,关键词三,关键词四"}
|
{"label": "102", "label_desc": "音乐", "sentence": "新闻1", "keywords": "关键词一,关键词二,关键词三,关键词四"}
|
||||||
{"label": "110", "label_desc": "news_military", "sentence": "新闻2", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
|
{"label": "110", "label_desc": "电影", "sentence": "新闻2", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
|
||||||
{"label": "104", "label_desc": "news_finance", "sentence": "新闻3", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
|
{"label": "104", "label_desc": "科技", "sentence": "新闻3", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
I saw a girl with a telescope.
|
I use MindSpore to train my model.
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,89 @@
|
||||||
|
Overall Architecture
|
||||||
|
MindSpore is a deep learning framework in all scenarios, aiming to achieve easy development, efficient execution, and
|
||||||
|
all-scenario coverage. Easy development features include API friendliness and low debugging difficulty. Efficient
|
||||||
|
execution includes computing efficiency, data preprocessing efficiency, and distributed training efficiency.
|
||||||
|
All-scenario coverage means that the framework supports cloud, edge, and device scenarios.
|
||||||
|
ME (MindExpression) provides user-level APIs for scientific computing, building and training neural networks, and
|
||||||
|
converting Python code of users into graphs. For more information about the overall architecture, see Overall
|
||||||
|
Architecture.
|
||||||
|
Design Concept
|
||||||
|
MindSpore originates from the best practices of the entire industry and provides unified model training, inference, and
|
||||||
|
export APIs for data scientists and algorithm engineers. It supports flexible deployment in different scenarios such as
|
||||||
|
the device, edge, and cloud, and promotes the prosperity of domains such as deep learning and scientific computing.
|
||||||
|
MindSpore provides the Python programming paradigm. Users can use the native control logic of Python to build complex
|
||||||
|
neural network models, simplifying AI programming. For details, see Quick Start for Beginners.
|
||||||
|
Currently, there are two execution modes of a mainstream deep learning framework: a static graph mode and a dynamic
|
||||||
|
graph mode. The static graph mode has a relatively high training performance, but is difficult to debug. On the
|
||||||
|
contrary, the dynamic graph mode is easy to debug, but is difficult to execute efficiently. MindSpore provides an
|
||||||
|
encoding mode that unifies dynamic and static graphs, which greatly improves the compatibility between static and
|
||||||
|
dynamic graphs. Instead of developing multiple sets of code, users can switch between the dynamic and static graph
|
||||||
|
modes by changing only one line of code. For example, set context.set_context(mode=context.PYNATIVE_MODE) to switch to
|
||||||
|
the dynamic graph mode, or set context.set_context(mode=context.GRAPH_MODE) to switch to the static graph mode, which
|
||||||
|
facilitates development and debugging, and improves performance experience.
|
||||||
|
A neural network model is usually trained based on gradient descent algorithm, but the manual derivation process is
|
||||||
|
complex and the result is prone to errors. The automatic differentiation mechanism of MindSpore based on source code
|
||||||
|
transformation (SCT) uses a functional differential programming architecture and provides Python APIs at the API layer,
|
||||||
|
including the expression of control flows. Users can focus on the native mathematical expression of the model algorithm
|
||||||
|
without manual derivation. The sample code for automatic differentiation is as follows:
|
||||||
|
In the first step, a function (computational graph) is defined. In the second step, automatic differentiation is
|
||||||
|
performed by using a backward API provided by MindSpore, and the first derivative function (computational graph) is
|
||||||
|
defined. In the third step, the second derivative function (computational graph) is defined. After the input is given,
|
||||||
|
the second derivative of the function defined in step 1 can be obtained at the specified position. The result of the
|
||||||
|
second derivative is 12.
|
||||||
|
In addition, the SCT can convert Python code into an intermediate representation (IR) of a MindSpore function. The IR
|
||||||
|
constructs a computational graph that can be parsed and executed on different devices. Before the computational graph
|
||||||
|
is executed, a plurality of software and hardware collaborative optimization technologies are used, and performance and
|
||||||
|
efficiency in different scenarios such as device, edge, and cloud, are improved.
|
||||||
|
Improving the data processing capability to match the computing power of AI chips is the key to ensure the ultimate
|
||||||
|
performance of AI chips. MindSpore provides multiple data processing operators and uses automatic data acceleration
|
||||||
|
technology to implement high-performance pipelines, including data loading, data demonstration, and data conversion. It
|
||||||
|
supports data processing capabilities in all scenarios, such as CV, NLP, and GNN. MindRecord is a self-developed data
|
||||||
|
format of MindSpore. It features efficient read and write and easy distributed processing. Users can convert
|
||||||
|
non-standard and common datasets to the MindRecord format to obtain better performance experience. For details about
|
||||||
|
the conversion, see MindSpore Data Format Conversion. MindSpore supports the loading of common datasets and datasets in
|
||||||
|
multiple data storage formats. For example, users can use dataset=dataset.Cifar10Dataset("Cifar10Data/") to load the
|
||||||
|
CIFAR-10 dataset. Cifar10Data/ indicates the local directory of the dataset, and users can also use GeneratorDataset to
|
||||||
|
customize the dataset loading mode. Data augmentation is a method of generating new data based on (limited) data, which
|
||||||
|
can reduce the overfitting phenomenon of network model and improve the generalization ability of the model. In addition
|
||||||
|
to user-defined data augmentation, MindSpore provides automatic data augmentation, making data augmentation more
|
||||||
|
flexible. For details, see Automatic Data Augmentation.
|
||||||
|
The deep learning neural network model usually contains many hidden layers for feature extraction. However, the feature
|
||||||
|
extraction is random and the debugging process is invisible, which limits the trustworthiness and optimization of the
|
||||||
|
deep learning technology. MindSpore supports visualized debugging and optimization (MindInsight) and provides functions
|
||||||
|
such as training dashboard, lineage, performance analysis, and debugger to help users detect deviations during model
|
||||||
|
training and easily debug and optimize models. For example, before initializing the network, users can use
|
||||||
|
profiler=Profiler() to initialize the Profiler object, automatically collect information such as the operator time
|
||||||
|
consumption during training, and record the information in a file. After the training is complete, call
|
||||||
|
profiler.analyse() to stop collecting data and generate performance analysis results. Users can view and analyze the
|
||||||
|
visualized results to more efficiently debug network performance. For details about debugging and optimization, see
|
||||||
|
Training Process Visualization.
|
||||||
|
As a scale of neural network models and datasets continuously increases, parallel distributed training becomes a common
|
||||||
|
practice of neural network training. However, policy selection and compilation of parallel distributed training are
|
||||||
|
very complex, which severely restricts training efficiency of a deep learning model and hinders development of deep
|
||||||
|
learning. MindSpore unifies the encoding methods of standalone and distributed training. Developers do not need to
|
||||||
|
write complex distributed policies. Instead, they can implement distributed training by adding a small amount of codes
|
||||||
|
to the standalone code. For example, after context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL)
|
||||||
|
is set, a cost model can be automatically established, and a better parallel mode can be selected for users. This
|
||||||
|
improves the training efficiency of neural networks, greatly decreases the AI development difficulty, and enables users
|
||||||
|
to quickly implement model. For more information, see Distributed Training.
|
||||||
|
Level Structure
|
||||||
|
To support network building, entire graph execution, subgraph execution, and single-operator execution, MindSpore
|
||||||
|
provides users with three levels of APIs which are Low-Level Python API, Medium-Level Python API, and High-Level Python
|
||||||
|
API in ascending order.
|
||||||
|
Low-Level Python API
|
||||||
|
The first is low-level API, including tensor definition, basic operators, and automatic differential modules. Users can
|
||||||
|
use the low-level API to easily define tensors and perform derivative calculation. For example, users can customize
|
||||||
|
tensors by using the Tensor API, and use the GradOperation operator in the ops.composite module to calculate the
|
||||||
|
derivative of the function at a specified position.
|
||||||
|
Medium-Level Python API
|
||||||
|
The second is medium-level API which encapsulates low-cost APIs and provides modules such as the network layer,
|
||||||
|
optimizer, and loss function. Users can flexibly build neural networks and control execution processes through the
|
||||||
|
medium-level API to quickly implement model algorithm logic. For example, users can call the Cell API to build neural
|
||||||
|
network models and computing logic, add the loss function and optimization methods to the neural network model by using
|
||||||
|
the loss module and Optimizer API, and use the dataset module to process data for model training and evaluation.
|
||||||
|
High-Level Python API
|
||||||
|
The third is high-level API. Based on the medium-level API, it provides advanced APIs such as training and inference
|
||||||
|
management, mixed precision training, and debugging and optimization, facilitating users to control the execution
|
||||||
|
process of the entire network and implement training, inference, and optimization of the neural network. For example,
|
||||||
|
users can use the Model API, specify the neural network model to be trained and related training settings, train the
|
||||||
|
neural network model, and debug the neural network performance through the Profiler API.
|
|
@ -1 +1 @@
|
||||||
Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they'll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it's like to be homeless? That is Goddard Bolt's lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days without the luxuries; if Bolt succeeds, he can do what he wants with a future project of making more buildings. The bet's on where Bolt is thrown on the street with a bracelet on his leg to monitor his every move where he can't step off the sidewalk. He's given the nickname Pepto by a vagrant after it's written on his forehead where Bolt meets other characters including a woman by the name of Molly (Lesley Ann Warren) an ex-dancer who got divorce before losing her home, and her pals Sailor (Howard Morris) and Fumes (Teddy Wilson) who are already used to the streets. They're survivors. Bolt isn't. He's not used to reaching mutual agreements like he once did when being rich where it's fight or flight, kill or be killed.<br /><br />While the love connection between Molly and Bolt wasn't necessary to plot, I found "Life Stinks" to be one of Mel Brooks' observant films where prior to being a comedy, it shows a tender side compared to his slapstick work such as Blazing Saddles, Young Frankenstein, or Spaceballs for the matter, to show what it's like having something valuable before losing it the next day or on the other hand making a stupid bet like all rich people do when they don't know what to do with their money. Maybe they should give it to the homeless instead of using it like Monopoly money.<br /><br />Or maybe this film will inspire you to help others.
|
MindSpore is a deep learning framework in all scenarios, aiming to achieve easy development, efficient execution, and all-scenario coverage.<br /><br />Easy development features include API friendliness and low debugging difficulty.<br /><br />Efficient execution includes computing efficiency, data preprocessing efficiency, and distributed training efficiency.<br /><br />All-scenario coverage means that the framework supports cloud, edge, and device scenarios.<br /><br />ME (MindExpression) provides user-level APIs for scientific computing, building and training neural networks, and converting Python code of users into graphs.<br /><br />For more information about the overall architecture, see Overall Architecture.
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -18,60 +18,86 @@ import mindspore.dataset.text as text
|
||||||
import mindspore.dataset as ds
|
import mindspore.dataset as ds
|
||||||
from mindspore.dataset.text import SentencePieceModel, to_str, SPieceTokenizerOutType
|
from mindspore.dataset.text import SentencePieceModel, to_str, SPieceTokenizerOutType
|
||||||
|
|
||||||
VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
|
VOCAB_FILE = "../data/dataset/test_sentencepiece/vocab.txt"
|
||||||
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"
|
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"
|
||||||
|
|
||||||
|
|
||||||
def test_sentence_piece_tokenizer_callable():
|
def test_sentence_piece_tokenizer_callable():
|
||||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer with eager mode
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
||||||
data = '123'
|
data = "123"
|
||||||
assert np.array_equal(tokenizer(data), ['▁', '12', '3'])
|
assert np.array_equal(tokenizer(data), ["▁", "1", "23"])
|
||||||
|
|
||||||
|
|
||||||
def test_from_vocab_to_str_UNIGRAM():
|
def test_from_vocab_to_str_unigram():
|
||||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer with UNIGRAM model
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
||||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||||
dataset = dataset.map(operations=tokenizer)
|
dataset = dataset.map(operations=tokenizer)
|
||||||
expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
|
expect = ["▁", "I", "▁use", "▁MindSpore", "▁", "to", "▁", "t", "r", "a", "i", "n", "▁", "m", "y", "▁model", "."]
|
||||||
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = to_str(i["text"])
|
ret = to_str(i["text"])
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
assert value == expect[key]
|
assert value == expect[key]
|
||||||
|
|
||||||
|
|
||||||
def test_from_vocab_to_str_BPE():
|
def test_from_vocab_to_str_bpe():
|
||||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.BPE, {})
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer with BPE model
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.BPE, {})
|
||||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
||||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||||
dataset = dataset.map(operations=tokenizer)
|
dataset = dataset.map(operations=tokenizer)
|
||||||
expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'c', 'ope', '.']
|
expect = ["▁", "I", "▁", "u", "s", "e", "▁", "M", "in", "d", "S", "p", "or", "e", "▁t", "o", "▁t", "ra", "in", "▁m",
|
||||||
|
"y", "▁m", "ode", "l", "."]
|
||||||
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = to_str(i["text"])
|
ret = to_str(i["text"])
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
assert value == expect[key]
|
assert value == expect[key]
|
||||||
|
|
||||||
|
|
||||||
def test_from_vocab_to_str_CHAR():
|
def test_from_vocab_to_str_char():
|
||||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer with CHAR model
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.CHAR, {})
|
||||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
||||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||||
dataset = dataset.map(operations=tokenizer)
|
dataset = dataset.map(operations=tokenizer)
|
||||||
expect = ['▁', 'I', '▁', 's', 'a', 'w', '▁', 'a', '▁', 'g', 'i', 'r', 'l', '▁', 'w', 'i', 't', 'h',\
|
expect = ["▁", "I", "▁", "u", "s", "e", "▁", "M", "i", "n", "d", "S", "p", "o", "r", "e", "▁", "t", "o", "▁", "t",
|
||||||
'▁', 'a', '▁', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.']
|
"r", "a", "i", "n", "▁", "m", "y", "▁", "m", "o", "d", "e", "l", "."]
|
||||||
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = to_str(i["text"])
|
ret = to_str(i["text"])
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
assert value == expect[key]
|
assert value == expect[key]
|
||||||
|
|
||||||
|
|
||||||
def test_from_vocab_to_str_WORD():
|
def test_from_vocab_to_str_word():
|
||||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer with WORD model
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.WORD, {})
|
||||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
||||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||||
dataset = dataset.map(operations=tokenizer)
|
dataset = dataset.map(operations=tokenizer)
|
||||||
expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
|
expect = ["▁I", "▁use", "▁MindSpore", "▁to", "▁train▁my▁model."]
|
||||||
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = to_str(i["text"])
|
ret = to_str(i["text"])
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
|
@ -79,11 +105,16 @@ def test_from_vocab_to_str_WORD():
|
||||||
|
|
||||||
|
|
||||||
def test_from_vocab_to_int():
|
def test_from_vocab_to_int():
|
||||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer with out_type equal to int
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
|
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
|
||||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||||
dataset = dataset.map(operations=tokenizer)
|
dataset = dataset.map(operations=tokenizer)
|
||||||
expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
|
expect = [3, 41, 59, 53, 3, 29, 3, 6, 12, 99, 7, 10, 3, 11, 20, 45, 19]
|
||||||
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = i["text"]
|
ret = i["text"]
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
|
@ -91,12 +122,17 @@ def test_from_vocab_to_int():
|
||||||
|
|
||||||
|
|
||||||
def test_from_file_to_str():
|
def test_from_file_to_str():
|
||||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer with out_type equal to string
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||||
text.SentencePieceVocab.save_model(vocab, "./", "m.model")
|
text.SentencePieceVocab.save_model(vocab, "./", "m.model")
|
||||||
tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.STRING)
|
tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.STRING)
|
||||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||||
dataset = dataset.map(operations=tokenizer)
|
dataset = dataset.map(operations=tokenizer)
|
||||||
expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
|
expect = ["▁", "I", "▁use", "▁MindSpore", "▁", "to", "▁", "t", "r", "a", "i", "n", "▁", "m", "y", "▁model", "."]
|
||||||
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = to_str(i["text"])
|
ret = to_str(i["text"])
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
|
@ -104,12 +140,17 @@ def test_from_file_to_str():
|
||||||
|
|
||||||
|
|
||||||
def test_from_file_to_int():
|
def test_from_file_to_int():
|
||||||
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer while loading vocab model from file
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
|
vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||||
text.SentencePieceVocab.save_model(vocab, "./", "m.model")
|
text.SentencePieceVocab.save_model(vocab, "./", "m.model")
|
||||||
tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.INT)
|
tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.INT)
|
||||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||||
dataset = dataset.map(operations=tokenizer)
|
dataset = dataset.map(operations=tokenizer)
|
||||||
expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
|
expect = [3, 41, 59, 53, 3, 29, 3, 6, 12, 99, 7, 10, 3, 11, 20, 45, 19]
|
||||||
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = i["text"]
|
ret = i["text"]
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
|
@ -117,12 +158,17 @@ def test_from_file_to_int():
|
||||||
|
|
||||||
|
|
||||||
def test_build_from_dataset():
|
def test_build_from_dataset():
|
||||||
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer while loading vocab model from dataset
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
|
data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
|
||||||
vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
||||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||||
dataset = dataset.map(operations=tokenizer)
|
dataset = dataset.map(operations=tokenizer)
|
||||||
expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
|
expect = ["▁", "I", "▁use", "▁MindSpore", "▁", "to", "▁", "t", "r", "a", "i", "n", "▁", "m", "y", "▁model", "."]
|
||||||
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = to_str(i["text"])
|
ret = to_str(i["text"])
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
|
@ -130,8 +176,8 @@ def test_build_from_dataset():
|
||||||
|
|
||||||
|
|
||||||
def apply_func(dataset):
|
def apply_func(dataset):
|
||||||
input_columns = ['text']
|
input_columns = ["text"]
|
||||||
output_columns = ['text2']
|
output_columns = ["text2"]
|
||||||
dataset = dataset.rename(input_columns, output_columns)
|
dataset = dataset.rename(input_columns, output_columns)
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
@ -141,7 +187,7 @@ def zip_test(dataset):
|
||||||
dataset_2 = copy.deepcopy(dataset)
|
dataset_2 = copy.deepcopy(dataset)
|
||||||
dataset_1 = dataset_1.apply(apply_func)
|
dataset_1 = dataset_1.apply(apply_func)
|
||||||
dataset_zip = ds.zip((dataset_1, dataset_2))
|
dataset_zip = ds.zip((dataset_1, dataset_2))
|
||||||
expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
|
expect = ["▁", "I", "▁use", "▁MindSpore", "▁", "to", "▁", "t", "r", "a", "i", "n", "▁", "m", "y", "▁model", "."]
|
||||||
for i in dataset_zip.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset_zip.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = to_str(i["text"])
|
ret = to_str(i["text"])
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
|
@ -151,15 +197,21 @@ def zip_test(dataset):
|
||||||
def concat_test(dataset):
|
def concat_test(dataset):
|
||||||
dataset_1 = copy.deepcopy(dataset)
|
dataset_1 = copy.deepcopy(dataset)
|
||||||
dataset = dataset.concat(dataset_1)
|
dataset = dataset.concat(dataset_1)
|
||||||
expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
|
expect = ["▁", "I", "▁use", "▁MindSpore", "▁", "to", "▁", "t", "r", "a", "i", "n", "▁", "m", "y", "▁model", "."]
|
||||||
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||||
ret = to_str(i["text"])
|
ret = to_str(i["text"])
|
||||||
for key, value in enumerate(ret):
|
for key, value in enumerate(ret):
|
||||||
assert value == expect[key]
|
assert value == expect[key]
|
||||||
|
|
||||||
|
|
||||||
def test_with_zip_concat():
|
def test_with_zip_concat():
|
||||||
|
"""
|
||||||
|
Feature: SentencePieceTokenizer
|
||||||
|
Description: test SentencePieceTokenizer with zip and concat operations
|
||||||
|
Expectation: output is equal to the expected value
|
||||||
|
"""
|
||||||
data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
|
data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
|
||||||
vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
|
vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
|
||||||
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
|
||||||
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
|
||||||
dataset = dataset.map(operations=tokenizer, num_parallel_workers=2)
|
dataset = dataset.map(operations=tokenizer, num_parallel_workers=2)
|
||||||
|
@ -169,10 +221,10 @@ def test_with_zip_concat():
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
test_sentence_piece_tokenizer_callable()
|
test_sentence_piece_tokenizer_callable()
|
||||||
test_from_vocab_to_str_UNIGRAM()
|
test_from_vocab_to_str_unigram()
|
||||||
test_from_vocab_to_str_BPE()
|
test_from_vocab_to_str_bpe()
|
||||||
test_from_vocab_to_str_CHAR()
|
test_from_vocab_to_str_char()
|
||||||
test_from_vocab_to_str_WORD()
|
test_from_vocab_to_str_word()
|
||||||
test_from_vocab_to_int()
|
test_from_vocab_to_int()
|
||||||
test_from_file_to_str()
|
test_from_file_to_str()
|
||||||
test_from_file_to_int()
|
test_from_file_to_int()
|
||||||
|
|
Loading…
Reference in New Issue