!33475 Remove test data containing sensitive word

Merge pull request !33475 from xiaotianci/fix_examples
commit fd22d69ca7
i-robot 2022-04-28 02:05:09 +00:00, committed by Gitee
11 changed files with 203 additions and 34842 deletions


@@ -192,7 +192,7 @@ class SentencePieceVocab {
  /// \par Example
  /// \code
  /// std::string dataset_path;
-  /// dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  /// dataset_path = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  /// std::vector<std::string> path_list;
  /// path_list.emplace_back(dataset_path);
  /// std::unordered_map<std::string, std::string> param_map;


@@ -38,13 +38,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess1 plus sentencepiece tokenizer.";

  // Create a TextFile dataset
-  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds_vocab, nullptr);

  // Create vocab from dataset
  std::shared_ptr<SentencePieceVocab> vocab =
-    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
+    ds_vocab->BuildSentencePieceVocab({}, 100, 0.9995, SentencePieceModel::kUnigram, {});
  EXPECT_NE(vocab, nullptr);

  // Create a TextFile dataset
@@ -70,13 +70,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess1) {
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected result after tokenization
-  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
+  std::vector<std::string> expected = {"", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r",
+                                       "a", "i", "n", "", "m", "y", "▁model", "."};
  std::shared_ptr<Tensor> de_expected_tensor;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
-  mindspore::MSTensor expected_tensor =
-    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
-  uint64_t i = 0;
+  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
+  uint32_t i = 0;

  while (row.size() != 0) {
    auto txt = row["text"];
    TEST_MS_LOG_MSTENSOR(INFO, "txt: ", txt);
@@ -97,13 +97,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabSuccess2 plus sentencepiece tokenizer.";

  // Create a TextFile dataset
-  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds_vocab, nullptr);

  // Create vocab from dataset
  std::shared_ptr<SentencePieceVocab> vocab =
-    ds_vocab->BuildSentencePieceVocab({}, 5000, 0.9995, SentencePieceModel::kUnigram, {});
+    ds_vocab->BuildSentencePieceVocab({}, 100, 0.9995, SentencePieceModel::kUnigram, {});
  EXPECT_NE(vocab, nullptr);

  // Save vocab model to local
@@ -133,13 +133,13 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabSuccess2) {
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected result after tokenization
-  std::vector<std::string> expected = {"▁I", "▁sa", "w", "▁a", "▁girl", "▁with", "▁a", "▁te", "les", "co", "pe", "."};
+  std::vector<std::string> expected = {"", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r",
+                                       "a", "i", "n", "", "m", "y", "▁model", "."};
  std::shared_ptr<Tensor> de_expected_tensor;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
-  mindspore::MSTensor expected_tensor =
-    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
-  uint64_t i = 0;
+  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
+  uint32_t i = 0;

  while (row.size() != 0) {
    auto txt = row["text"];
    TEST_MS_LOG_MSTENSOR(INFO, "txt: ", txt);
@@ -160,7 +160,7 @@ TEST_F(MindDataTestPipeline, TestSentencePieceVocabFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSentencePieceVocabFail1 with incorrect parameter.";

  // Create a TextFile dataset
-  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  std::string vocab_file = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  std::shared_ptr<Dataset> ds_vocab = TextFile({vocab_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds_vocab, nullptr);


@@ -18,11 +18,9 @@
#include "minddata/dataset/core/de_tensor.h"
#include "minddata/dataset/include/dataset/audio.h"
#include "minddata/dataset/include/dataset/execute.h"
-#include "minddata/dataset/include/dataset/transforms.h"
-#include "minddata/dataset/include/dataset/audio.h"
-#include "minddata/dataset/include/dataset/vision.h"
-#include "minddata/dataset/include/dataset/audio.h"
#include "minddata/dataset/include/dataset/text.h"
+#include "minddata/dataset/include/dataset/transforms.h"
+#include "minddata/dataset/include/dataset/vision.h"
#include "minddata/dataset/text/char_n_gram.h"
#include "minddata/dataset/text/fast_text.h"
#include "minddata/dataset/text/glove.h"


@@ -16,6 +16,7 @@
#include <string>
#include <string_view>
#include "common/common.h"
#include "minddata/dataset/engine/datasetops/build_sentence_piece_vocab_op.h"
#include "minddata/dataset/text/kernels/sentence_piece_tokenizer_op.h"
@@ -60,11 +61,11 @@ TEST_F(MindDataTestSentencePieceVocabOp, TestSentencePieceFromFileFuntions) {
  MS_LOG(INFO) << "Doing MindDataTestSentencePieceVocabOp TestSentencePieceFromFileFuntions.";

  std::string dataset_path;
-  dataset_path = datasets_root_path_ + "/test_sentencepiece/botchan.txt";
+  dataset_path = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
  std::vector<std::string> path_list;
  path_list.emplace_back(dataset_path);
  std::unordered_map<std::string, std::string> param_map;
  std::shared_ptr<SentencePieceVocab> spm = std::make_unique<SentencePieceVocab>();
-  Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995, SentencePieceModel::kUnigram, param_map, &spm);
+  Status rc = SentencePieceVocab::BuildFromFile(path_list, 100, 0.9995, SentencePieceModel::kUnigram, param_map, &spm);
  ASSERT_TRUE(rc.IsOk());
}


@@ -1,3 +1,3 @@
-{"label": "102", "label_desc": "news_entertainment", "sentence": "新闻1", "keywords": "关键词一,关键词二,关键词三,关键词四"}
-{"label": "110", "label_desc": "news_military", "sentence": "新闻2", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
-{"label": "104", "label_desc": "news_finance", "sentence": "新闻3", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
+{"label": "102", "label_desc": "音乐", "sentence": "新闻1", "keywords": "关键词一,关键词二,关键词三,关键词四"}
+{"label": "110", "label_desc": "电影", "sentence": "新闻2", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}
+{"label": "104", "label_desc": "科技", "sentence": "新闻3", "keywords": "关键词一,关键词二,关键词三,关键词四,关键词五"}


@@ -1 +1 @@
-I saw a girl with a telescope.
+I use MindSpore to train my model.

File diff suppressed because it is too large.


@@ -0,0 +1,89 @@
Overall Architecture
MindSpore is a deep learning framework in all scenarios, aiming to achieve easy development, efficient execution, and
all-scenario coverage. Easy development features include API friendliness and low debugging difficulty. Efficient
execution includes computing efficiency, data preprocessing efficiency, and distributed training efficiency.
All-scenario coverage means that the framework supports cloud, edge, and device scenarios.
ME (MindExpression) provides user-level APIs for scientific computing, building and training neural networks, and
converting Python code of users into graphs. For more information about the overall architecture, see Overall
Architecture.
Design Concept
MindSpore originates from the best practices of the entire industry and provides unified model training, inference, and
export APIs for data scientists and algorithm engineers. It supports flexible deployment in different scenarios such as
the device, edge, and cloud, and promotes the prosperity of domains such as deep learning and scientific computing.
MindSpore provides the Python programming paradigm. Users can use the native control logic of Python to build complex
neural network models, simplifying AI programming. For details, see Quick Start for Beginners.
Currently, there are two execution modes of a mainstream deep learning framework: a static graph mode and a dynamic
graph mode. The static graph mode has a relatively high training performance, but is difficult to debug. On the
contrary, the dynamic graph mode is easy to debug, but is difficult to execute efficiently. MindSpore provides an
encoding mode that unifies dynamic and static graphs, which greatly improves the compatibility between static and
dynamic graphs. Instead of developing multiple sets of code, users can switch between the dynamic and static graph
modes by changing only one line of code. For example, set context.set_context(mode=context.PYNATIVE_MODE) to switch to
the dynamic graph mode, or set context.set_context(mode=context.GRAPH_MODE) to switch to the static graph mode, which
facilitates development and debugging, and improves performance experience.
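For illustration, a minimal sketch of this one-line switch (the tensors below are placeholder values, not taken from this change; only the set_context line differs between the two modes):

    import numpy as np
    from mindspore import Tensor, context

    # Change only this line to switch execution modes.
    context.set_context(mode=context.PYNATIVE_MODE)   # dynamic graph mode
    # context.set_context(mode=context.GRAPH_MODE)    # static graph mode

    x = Tensor(np.ones((2, 2), np.float32))
    y = Tensor(np.ones((2, 2), np.float32))
    print(x + y)   # the same script runs unchanged in either mode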
A neural network model is usually trained based on gradient descent algorithm, but the manual derivation process is
complex and the result is prone to errors. The automatic differentiation mechanism of MindSpore based on source code
transformation (SCT) uses a functional differential programming architecture and provides Python APIs at the API layer,
including the expression of control flows. Users can focus on the native mathematical expression of the model algorithm
without manual derivation. The sample code for automatic differentiation is as follows:
In the first step, a function (computational graph) is defined. In the second step, automatic differentiation is
performed by using a backward API provided by MindSpore, and the first derivative function (computational graph) is
defined. In the third step, the second derivative function (computational graph) is defined. After the input is given,
the second derivative of the function defined in step 1 can be obtained at the specified position. The result of the
second derivative is 12.
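A minimal sketch of the three steps described above, assuming the function is f(x) = x**3 and the input is 2, which yields the stated second derivative of 12 (the exact function used in the original sample is an assumption here):

    import numpy as np
    import mindspore.nn as nn
    import mindspore.ops as ops
    from mindspore import Tensor

    class Net(nn.Cell):                      # step 1: define the function f(x) = x**3
        def construct(self, x):
            return x ** 3

    class Grad(nn.Cell):                     # wrap a network with GradOperation to differentiate it
        def __init__(self, net):
            super().__init__()
            self.grad = ops.GradOperation()
            self.net = net

        def construct(self, x):
            return self.grad(self.net)(x)

    first_grad = Grad(Net())                 # step 2: first derivative, 3 * x**2
    second_grad = Grad(first_grad)           # step 3: second derivative, 6 * x
    print(second_grad(Tensor(np.array([2.0], np.float32))))   # [12.]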
In addition, the SCT can convert Python code into an intermediate representation (IR) of a MindSpore function. The IR
constructs a computational graph that can be parsed and executed on different devices. Before the computational graph
is executed, a plurality of software and hardware collaborative optimization technologies are used, and performance and
efficiency in different scenarios such as device, edge, and cloud, are improved.
Improving the data processing capability to match the computing power of AI chips is the key to ensure the ultimate
performance of AI chips. MindSpore provides multiple data processing operators and uses automatic data acceleration
technology to implement high-performance pipelines, including data loading, data demonstration, and data conversion. It
supports data processing capabilities in all scenarios, such as CV, NLP, and GNN. MindRecord is a self-developed data
format of MindSpore. It features efficient read and write and easy distributed processing. Users can convert
non-standard and common datasets to the MindRecord format to obtain better performance experience. For details about
the conversion, see MindSpore Data Format Conversion. MindSpore supports the loading of common datasets and datasets in
multiple data storage formats. For example, users can use dataset=dataset.Cifar10Dataset("Cifar10Data/") to load the
CIFAR-10 dataset. Cifar10Data/ indicates the local directory of the dataset, and users can also use GeneratorDataset to
customize the dataset loading mode. Data augmentation is a method of generating new data based on (limited) data, which
can reduce the overfitting phenomenon of network model and improve the generalization ability of the model. In addition
to user-defined data augmentation, MindSpore provides automatic data augmentation, making data augmentation more
flexible. For details, see Automatic Data Augmentation.
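For example, a minimal sketch of the two loading styles mentioned above ("Cifar10Data/" must point to a local copy of the CIFAR-10 dataset, and the generator data is made up):

    import numpy as np
    import mindspore.dataset as ds

    # Load a standard dataset from a local directory.
    cifar10 = ds.Cifar10Dataset("Cifar10Data/", num_samples=4)

    # Or customize the loading mode with GeneratorDataset.
    def my_source():
        for i in range(4):
            yield (np.full((2, 2), i, dtype=np.float32), np.array(i, dtype=np.int32))

    custom = ds.GeneratorDataset(my_source, column_names=["data", "label"])
    for row in custom.create_dict_iterator(num_epochs=1, output_numpy=True):
        print(row["label"])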
The deep learning neural network model usually contains many hidden layers for feature extraction. However, the feature
extraction is random and the debugging process is invisible, which limits the trustworthiness and optimization of the
deep learning technology. MindSpore supports visualized debugging and optimization (MindInsight) and provides functions
such as training dashboard, lineage, performance analysis, and debugger to help users detect deviations during model
training and easily debug and optimize models. For example, before initializing the network, users can use
profiler=Profiler() to initialize the Profiler object, automatically collect information such as the operator time
consumption during training, and record the information in a file. After the training is complete, call
profiler.analyse() to stop collecting data and generate performance analysis results. Users can view and analyze the
visualized results to more efficiently debug network performance. For details about debugging and optimization, see
Training Process Visualization.
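For example, a minimal sketch of that profiling flow, assuming a device target on which profiling is supported; the matrix multiplication is only a placeholder for real training steps:

    import numpy as np
    import mindspore.ops as ops
    from mindspore import Tensor
    from mindspore.profiler import Profiler

    profiler = Profiler(output_path="./profiler_data")   # start collecting before the network runs

    x = Tensor(np.ones((16, 16), np.float32))
    y = ops.MatMul()(x, x)                                # placeholder computation

    profiler.analyse()                                    # stop collecting and generate the results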
As the scale of neural network models and datasets continuously increases, parallel distributed training becomes a common
practice of neural network training. However, policy selection and compilation of parallel distributed training are
very complex, which severely restricts training efficiency of a deep learning model and hinders development of deep
learning. MindSpore unifies the encoding methods of standalone and distributed training. Developers do not need to
write complex distributed policies. Instead, they can implement distributed training by adding a small amount of codes
to the standalone code. For example, after context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL)
is set, a cost model can be automatically established, and a better parallel mode can be selected for users. This
improves the training efficiency of neural networks, greatly decreases the AI development difficulty, and enables users
to quickly implement models. For more information, see Distributed Training.
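For example, a minimal sketch of turning a standalone script into an auto-parallel one (init() assumes the job was launched in a correctly configured multi-device environment):

    from mindspore import context
    from mindspore.communication import init
    from mindspore.context import ParallelMode

    init()   # initialize the communication backend for the launched devices
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL)
    # ...the rest of the standalone training code stays as it is...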
Level Structure
To support network building, entire graph execution, subgraph execution, and single-operator execution, MindSpore
provides users with three levels of APIs which are Low-Level Python API, Medium-Level Python API, and High-Level Python
API in ascending order.
Low-Level Python API
The first is low-level API, including tensor definition, basic operators, and automatic differential modules. Users can
use the low-level API to easily define tensors and perform derivative calculation. For example, users can customize
tensors by using the Tensor API, and use the GradOperation operator in the ops.composite module to calculate the
derivative of the function at a specified position.
Medium-Level Python API
The second is the medium-level API, which encapsulates the low-level APIs and provides modules such as the network layer,
optimizer, and loss function. Users can flexibly build neural networks and control execution processes through the
medium-level API to quickly implement model algorithm logic. For example, users can call the Cell API to build neural
network models and computing logic, add the loss function and optimization methods to the neural network model by using
the loss module and Optimizer API, and use the dataset module to process data for model training and evaluation.
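For example, a small sketch of assembling a network, loss function, and optimizer from these medium-level modules (the layer sizes are arbitrary):

    import mindspore.nn as nn

    class SimpleNet(nn.Cell):
        """A toy two-layer network built from nn layers."""
        def __init__(self):
            super().__init__()
            self.fc1 = nn.Dense(32, 16)
            self.relu = nn.ReLU()
            self.fc2 = nn.Dense(16, 10)

        def construct(self, x):
            return self.fc2(self.relu(self.fc1(x)))

    net = SimpleNet()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)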
High-Level Python API
The third is high-level API. Based on the medium-level API, it provides advanced APIs such as training and inference
management, mixed precision training, and debugging and optimization, facilitating users to control the execution
process of the entire network and implement training, inference, and optimization of the neural network. For example,
users can use the Model API, specify the neural network model to be trained and related training settings, train the
neural network model, and debug the neural network performance through the Profiler API.
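For example, a sketch of driving one epoch of training through the Model API, with a made-up random dataset and a trivial stand-in network (all names and sizes here are illustrative assumptions):

    import numpy as np
    import mindspore.dataset as ds
    import mindspore.nn as nn
    from mindspore import Model

    def make_data():
        for _ in range(100):
            yield (np.random.rand(32).astype(np.float32),
                   np.array(np.random.randint(0, 10), dtype=np.int32))

    train_dataset = ds.GeneratorDataset(make_data, column_names=["data", "label"]).batch(10)

    net = nn.Dense(32, 10)   # trivial stand-in network
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

    model = Model(net, loss_fn=loss_fn, optimizer=optimizer, metrics={"accuracy"})
    model.train(1, train_dataset)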


@@ -1 +1 @@
-Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they'll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it's like to be homeless? That is Goddard Bolt's lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days without the luxuries; if Bolt succeeds, he can do what he wants with a future project of making more buildings. The bet's on where Bolt is thrown on the street with a bracelet on his leg to monitor his every move where he can't step off the sidewalk. He's given the nickname Pepto by a vagrant after it's written on his forehead where Bolt meets other characters including a woman by the name of Molly (Lesley Ann Warren) an ex-dancer who got divorce before losing her home, and her pals Sailor (Howard Morris) and Fumes (Teddy Wilson) who are already used to the streets. They're survivors. Bolt isn't. He's not used to reaching mutual agreements like he once did when being rich where it's fight or flight, kill or be killed.<br /><br />While the love connection between Molly and Bolt wasn't necessary to plot, I found "Life Stinks" to be one of Mel Brooks' observant films where prior to being a comedy, it shows a tender side compared to his slapstick work such as Blazing Saddles, Young Frankenstein, or Spaceballs for the matter, to show what it's like having something valuable before losing it the next day or on the other hand making a stupid bet like all rich people do when they don't know what to do with their money. Maybe they should give it to the homeless instead of using it like Monopoly money.<br /><br />Or maybe this film will inspire you to help others.
+MindSpore is a deep learning framework in all scenarios, aiming to achieve easy development, efficient execution, and all-scenario coverage.<br /><br />Easy development features include API friendliness and low debugging difficulty.<br /><br />Efficient execution includes computing efficiency, data preprocessing efficiency, and distributed training efficiency.<br /><br />All-scenario coverage means that the framework supports cloud, edge, and device scenarios.<br /><br />ME (MindExpression) provides user-level APIs for scientific computing, building and training neural networks, and converting Python code of users into graphs.<br /><br />For more information about the overall architecture, see Overall Architecture.

File diff suppressed because it is too large.


@@ -18,60 +18,86 @@ import mindspore.dataset.text as text
import mindspore.dataset as ds
from mindspore.dataset.text import SentencePieceModel, to_str, SPieceTokenizerOutType

-VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"
+VOCAB_FILE = "../data/dataset/test_sentencepiece/vocab.txt"
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"


def test_sentence_piece_tokenizer_callable():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with eager mode
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
-    data = '123'
-    assert np.array_equal(tokenizer(data), ['', '12', '3'])
+    data = "123"
+    assert np.array_equal(tokenizer(data), ["", "1", "23"])


-def test_from_vocab_to_str_UNIGRAM():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+def test_from_vocab_to_str_unigram():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with UNIGRAM model
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


-def test_from_vocab_to_str_BPE():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.BPE, {})
+def test_from_vocab_to_str_bpe():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with BPE model
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.BPE, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'c', 'ope', '.']
+    expect = ["", "I", "", "u", "s", "e", "", "M", "in", "d", "S", "p", "or", "e", "▁t", "o", "▁t", "ra", "in", "▁m",
+              "y", "▁m", "ode", "l", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


-def test_from_vocab_to_str_CHAR():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
+def test_from_vocab_to_str_char():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with CHAR model
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.CHAR, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['', 'I', '', 's', 'a', 'w', '', 'a', '', 'g', 'i', 'r', 'l', '', 'w', 'i', 't', 'h',\
-              '', 'a', '', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.']
+    expect = ["", "I", "", "u", "s", "e", "", "M", "i", "n", "d", "S", "p", "o", "r", "e", "", "t", "o", "", "t",
+              "r", "a", "i", "n", "", "m", "y", "", "m", "o", "d", "e", "l", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


-def test_from_vocab_to_str_WORD():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
+def test_from_vocab_to_str_word():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with WORD model
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.WORD, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
+    expect = ["▁I", "▁use", "▁MindSpore", "▁to", "▁train▁my▁model."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):

@@ -79,11 +105,16 @@ def test_from_vocab_to_str_WORD():

def test_from_vocab_to_int():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with out_type equal to int
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
+    expect = [3, 41, 59, 53, 3, 29, 3, 6, 12, 99, 7, 10, 3, 11, 20, 45, 19]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = i["text"]
        for key, value in enumerate(ret):

@@ -91,12 +122,17 @@ def test_from_vocab_to_int():

def test_from_file_to_str():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with out_type equal to string
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, "./", "m.model")
    tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):

@@ -104,12 +140,17 @@ def test_from_file_to_str():

def test_from_file_to_int():
-    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer while loading vocab model from file
+    Expectation: output is equal to the expected value
+    """
+    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, "./", "m.model")
    tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.INT)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
+    expect = [3, 41, 59, 53, 3, 29, 3, 6, 12, 99, 7, 10, 3, 11, 20, 45, 19]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = i["text"]
        for key, value in enumerate(ret):

@@ -117,12 +158,17 @@ def test_from_file_to_int():

def test_build_from_dataset():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer while loading vocab model from dataset
+    Expectation: output is equal to the expected value
+    """
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
-    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):

@@ -130,8 +176,8 @@ def test_build_from_dataset():

def apply_func(dataset):
-    input_columns = ['text']
-    output_columns = ['text2']
+    input_columns = ["text"]
+    output_columns = ["text2"]
    dataset = dataset.rename(input_columns, output_columns)
    return dataset

@@ -141,7 +187,7 @@ def zip_test(dataset):
    dataset_2 = copy.deepcopy(dataset)
    dataset_1 = dataset_1.apply(apply_func)
    dataset_zip = ds.zip((dataset_1, dataset_2))
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset_zip.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):

@@ -151,15 +197,21 @@ def zip_test(dataset):

def concat_test(dataset):
    dataset_1 = copy.deepcopy(dataset)
    dataset = dataset.concat(dataset_1)
-    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
+    expect = ["", "I", "▁use", "▁MindSpore", "", "to", "", "t", "r", "a", "i", "n", "", "m", "y", "▁model", "."]
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_with_zip_concat():
+    """
+    Feature: SentencePieceTokenizer
+    Description: test SentencePieceTokenizer with zip and concat operations
+    Expectation: output is equal to the expected value
+    """
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
-    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
+    vocab = text.SentencePieceVocab.from_dataset(data, ["text"], 100, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer, num_parallel_workers=2)

@@ -169,10 +221,10 @@ def test_with_zip_concat():

if __name__ == "__main__":
    test_sentence_piece_tokenizer_callable()
-    test_from_vocab_to_str_UNIGRAM()
-    test_from_vocab_to_str_BPE()
-    test_from_vocab_to_str_CHAR()
-    test_from_vocab_to_str_WORD()
+    test_from_vocab_to_str_unigram()
+    test_from_vocab_to_str_bpe()
+    test_from_vocab_to_str_char()
+    test_from_vocab_to_str_word()
    test_from_vocab_to_int()
    test_from_file_to_str()
    test_from_file_to_int()