DuplicateOp

This commit is contained in:
hesham 2020-06-18 14:49:21 -04:00
parent 90bb9320aa
commit 11826fb256
11 changed files with 222 additions and 16 deletions

View File

@ -40,6 +40,7 @@
#include "dataset/kernels/image/resize_op.h"
#include "dataset/kernels/image/uniform_aug_op.h"
#include "dataset/kernels/image/bounding_box_augment_op.h"
#include "dataset/kernels/data/duplicate_op.h"
#include "dataset/kernels/data/fill_op.h"
#include "dataset/kernels/data/mask_op.h"
#include "dataset/kernels/data/pad_end_op.h"
@ -443,6 +444,9 @@ void bindTensorOps2(py::module *m) {
"Tensor mask operation using relational comparator")
.def(py::init<RelationalOp, std::shared_ptr<Tensor>, DataType>());
(void)py::class_<DuplicateOp, TensorOp, std::shared_ptr<DuplicateOp>>(*m, "DuplicateOp", "Duplicate tensor.")
.def(py::init<>());
(void)py::class_<TruncateSequencePairOp, TensorOp, std::shared_ptr<TruncateSequencePairOp>>(
*m, "TruncateSequencePairOp", "Tensor operation to truncate two tensors to a max_length")
.def(py::init<int64_t>());

View File

@ -115,6 +115,16 @@ class Tensor {
static Status CreateTensor(std::shared_ptr<Tensor> *, TensorImpl tensor_impl, const TensorShape &shape, DataType type,
const unsigned char *data = nullptr);
/// Create a deep copy of the input tensor.
/// The new tensor gets the same shape and type, and its own buffer initialized
/// from the input's raw bytes (via GetBuffer()/SizeInBytes()).
/// \param out [out] output tensor to be generated
/// \param in [in] original tensor to be copied
/// \return Status
static Status CreateTensor(std::shared_ptr<Tensor> *out, const std::shared_ptr<Tensor> &in) {
  // Use the global tensor allocator so the copy is placed like any other tensor.
  const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
  *out = std::allocate_shared<Tensor>(*alloc, in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes());
  return Status::OK();
}
// A static factory method to create a Tensor from a given py::array.
// @param ptr output argument to hold the created Tensor
// @param arr py::array

View File

@ -10,4 +10,5 @@ add_library(kernels-data OBJECT
slice_op.cc
mask_op.cc
concatenate_op.cc
duplicate_op.cc
)

View File

@ -0,0 +1,35 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/kernels/data/duplicate_op.h"
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
namespace mindspore {
namespace dataset {
// Emit two output columns: the input tensor itself, followed by a deep copy of it.
Status DuplicateOp::Compute(const TensorRow &input, TensorRow *output) {
  IO_CHECK_VECTOR(input, output);
  // Duplicate is defined over exactly one input column.
  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
  // Deep-copy the input so the second column owns a separate buffer.
  std::shared_ptr<Tensor> copied;
  RETURN_IF_NOT_OK(Tensor::CreateTensor(&copied, input[0]));
  // Column order matters: original first, then the copy.
  output->push_back(input[0]);
  output->push_back(copied);
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,42 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DATASET_KERNELS_DATA_DUPLICATE_OP_H_
#define DATASET_KERNELS_DATA_DUPLICATE_OP_H_
#include <vector>
#include <memory>
#include "dataset/core/tensor.h"
#include "dataset/kernels/tensor_op.h"
namespace mindspore {
namespace dataset {
// Tensor operation that duplicates its single input tensor: the input is passed
// through as the first output column and a deep copy becomes the second column.
class DuplicateOp : public TensorOp {
 public:
  DuplicateOp() = default;
  ~DuplicateOp() override = default;
  // Print the name of the op.
  void Print(std::ostream &out) const override { out << "DuplicateOp"; }
  // \param input [in] row holding exactly one tensor
  // \param output [out] row receiving the input tensor plus its copy
  // \return Status
  Status Compute(const TensorRow &input, TensorRow *output) override;
  // This op produces two output columns for its one input column.
  uint32_t NumOutput() override { return 2; }
};
} // namespace dataset
} // namespace mindspore
#endif  // DATASET_KERNELS_DATA_DUPLICATE_OP_H_

View File

@ -4869,10 +4869,10 @@ class BuildVocabDataset(DatasetOp):
top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
all words are included).
special_tokens(list): a list of strings, each one is a special token. for e.g. ["<pad>","<unk>"]
(default=None, no special tokens will be added).
special_first(bool): whether special_tokens will be prepended/appended to vocab, If special_tokens is
specified and special_first is set to None, special_tokens will be prepended. (default=None).
special_tokens(list, optional): a list of strings, each one is a special token. for example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
is specified and special_first is set to None, special_tokens will be prepended. (default=None).
prefetch_size (int, optional): prefetch number of records ahead of the user's request (default=None).
"""

View File

@ -50,8 +50,8 @@ class Vocab(cde.Vocab):
top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None
all words are included).
special_tokens(list): a list of strings, each one is a special token. for e.g. ["<pad>","<unk>"]
(default=None, no special tokens will be added).
special_tokens(list, optional): a list of strings, each one is a special token. for example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
is specified and special_first is set to None, special_tokens will be prepended. (default=None).
return:
@ -72,8 +72,8 @@ class Vocab(cde.Vocab):
build a vocab object from a list of word.
Args:
word_list(list): a list of string where each element is a word of type string.
special_tokens(list): a list of strings, each one is a special token. for e.g. ["<pad>","<unk>"]
(default=None, no special tokens will be added).
special_tokens(list, optional): a list of strings, each one is a special token. for example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
is specified and special_first is set to None, special_tokens will be prepended. (default=None).
"""
@ -89,8 +89,8 @@ class Vocab(cde.Vocab):
delimiter(str, optional): a delimiter to break up each line in file, the first element is taken to be
the word (default=None).
vocab_size(int, optional): number of words to read from file_path (default=None, all words are taken).
special_tokens(list): a list of strings, each one is a special token. for e.g. ["<pad>","<unk>"]
(default=None, no special tokens will be added).
special_tokens(list, optional): a list of strings, each one is a special token. for example
special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
is specified and special_first is set to None, special_tokens will be prepended. (default=None).
"""

View File

@ -203,3 +203,22 @@ class Concatenate(cde.ConcatenateOp):
def __init__(self, axis=0, prepend=None, append=None):
# add some validations here later
super().__init__(axis, prepend, append)
class Duplicate(cde.DuplicateOp):
    """
    Duplicate the input tensor to a new output tensor. The input tensor is carried over to the output list.

    Examples:
        >>> # Data before
        >>> # | x |
        >>> # +---------+
        >>> # | [1,2,3] |
        >>> # +---------+
        >>> data = data.map(input_columns=["x"], operations=Duplicate(),
        ...                 output_columns=["x", "y"], columns_order=["x", "y"])
        >>> # Data after
        >>> # | x | y |
        >>> # +---------+---------+
        >>> # | [1,2,3] | [1,2,3] |
        >>> # +---------+---------+
    """

View File

@ -0,0 +1,49 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dataset/core/client.h"
#include "common/common.h"
#include "gtest/gtest.h"
#include "dataset/core/tensor.h"
#include "dataset/util/de_error.h"
#include "dataset/kernels/data/duplicate_op.h"
using namespace mindspore::dataset;
namespace py = pybind11;
// Test fixture for DuplicateOp; runs the common global initialization before each test.
class MindDataTestDuplicateOp : public UT::Common {
 public:
  MindDataTestDuplicateOp() {}
  void SetUp() { GlobalInit(); }
};
// Verify DuplicateOp emits the input tensor as column 0 and a deep copy as column 1.
// (The previous version also built a scalar tensor `v` that was never used.)
TEST_F(MindDataTestDuplicateOp, Basics) {
  // Build a 1-D uint32 tensor to serve as the single input column.
  std::shared_ptr<Tensor> t;
  Tensor::CreateTensor(&t, std::vector<uint32_t>({1, 2, 3, 4, 5, 6}));
  std::shared_ptr<DuplicateOp> op = std::make_shared<DuplicateOp>();
  TensorRow in;
  in.push_back(t);
  TensorRow out;
  ASSERT_TRUE(op->Compute(in, &out).IsOk());
  // Exactly two output columns, both equal in contents to the input.
  ASSERT_EQ(out.size(), 2);
  ASSERT_TRUE(*t == *out[0]);
  ASSERT_TRUE(*t == *out[1]);
  // Column 0 shares the input's buffer; column 1 must own a distinct buffer.
  ASSERT_TRUE(t->GetBuffer() == out[0]->GetBuffer());
  ASSERT_TRUE(t->GetBuffer() != out[1]->GetBuffer());
}

View File

@ -0,0 +1,40 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing Duplicate op in DE
"""
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as ops
def compare(array):
    """Run Duplicate on a one-column dataset and check both output columns equal the input."""
    expected = np.array(array)
    data = ds.NumpySlicesDataset([array], column_names="x")
    data = data.map(input_columns=["x"], output_columns=["x", "y"], columns_order=["x", "y"],
                    operations=ops.Duplicate())
    for row in data.create_dict_iterator():
        np.testing.assert_array_equal(expected, row["x"])
        np.testing.assert_array_equal(expected, row["y"])
def test_duplicate_basics():
    """Duplicate should mirror both numeric and byte-string 1-D arrays."""
    for arr in ([1, 2, 3], [b"1", b"2", b"3"]):
        compare(arr)
if __name__ == "__main__":
test_duplicate_basics()

View File

@ -1,4 +1,4 @@
# Copyright 2019 Huawei Technologies Co., Ltd
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -94,9 +94,10 @@ def test_from_file():
for word in texts.split(" "):
yield (np.array(word, dtype='S'),)
def test_config(lookup_str, special_tokens, special_first):
def test_config(lookup_str, vocab_size, special_tokens, special_first):
try:
vocab = text.Vocab.from_file(SIMPLE_VOCAB_FILE, special_tokens=special_tokens, special_first=special_first)
vocab = text.Vocab.from_file(SIMPLE_VOCAB_FILE, vocab_size=vocab_size, special_tokens=special_tokens,
special_first=special_first)
data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
res = []
@ -106,9 +107,14 @@ def test_from_file():
except ValueError as e:
return str(e)
assert test_config("w1 w2 w3", ["s1", "s2", "s3"], True) == [3, 4, 5]
assert test_config("w1 w2 w3", ["s1", "s2", "s3"], False) == [0, 1, 2]
assert "special_tokens contains duplicate" in test_config("w1", ["s1", "s1"], True)
# test special tokens are prepended
assert test_config("w1 w2 w3 s1 s2 s3", None, ["s1", "s2", "s3"], True) == [3, 4, 5, 0, 1, 2]
# test special tokens are appended
assert test_config("w1 w2 w3 s1 s2 s3", None, ["s1", "s2", "s3"], False) == [0, 1, 2, 8, 9, 10]
# test special tokens are appended when not all words in file are used (special_first=False, w1 maps to 0)
assert test_config("w1 w2 w3 s1 s2 s3", 3, ["s1", "s2", "s3"], False) == [0, 1, 2, 3, 4, 5]
# test exception: special_tokens contains duplicate words
assert "special_tokens contains duplicate" in test_config("w1", None, ["s1", "s1"], True)
if __name__ == '__main__':