forked from mindspore-Ecosystem/mindspore

DuplicateOp

parent 90bb9320aa → commit 11826fb256
@@ -40,6 +40,7 @@
 #include "dataset/kernels/image/resize_op.h"
 #include "dataset/kernels/image/uniform_aug_op.h"
 #include "dataset/kernels/image/bounding_box_augment_op.h"
+#include "dataset/kernels/data/duplicate_op.h"
 #include "dataset/kernels/data/fill_op.h"
 #include "dataset/kernels/data/mask_op.h"
 #include "dataset/kernels/data/pad_end_op.h"
@@ -443,6 +444,9 @@ void bindTensorOps2(py::module *m) {
                     "Tensor mask operation using relational comparator")
       .def(py::init<RelationalOp, std::shared_ptr<Tensor>, DataType>());

+  (void)py::class_<DuplicateOp, TensorOp, std::shared_ptr<DuplicateOp>>(*m, "DuplicateOp", "Duplicate tensor.")
+      .def(py::init<>());
+
   (void)py::class_<TruncateSequencePairOp, TensorOp, std::shared_ptr<TruncateSequencePairOp>>(
     *m, "TruncateSequencePairOp", "Tensor operation to truncate two tensors to a max_length")
       .def(py::init<int64_t>());
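The binding above exposes only a no-argument constructor; Python code reaches it through the c_transforms wrapper added later in this commit. A minimal sketch of the call path, assuming the dataset package's usual `mindspore._c_dataengine` module alias:

    import mindspore._c_dataengine as cde

    # Hypothetical direct use of the raw binding; the public entry point
    # is the c_transforms.Duplicate wrapper defined further down.
    op = cde.DuplicateOp()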
@@ -115,6 +115,16 @@ class Tensor {
   static Status CreateTensor(std::shared_ptr<Tensor> *, TensorImpl tensor_impl, const TensorShape &shape, DataType type,
                              const unsigned char *data = nullptr);

+  /// Create a copy of the input tensor
+  /// \param out [out] output tensor to be generated
+  /// \param in [in] original tensor to be copied
+  /// \return Status
+  static Status CreateTensor(std::shared_ptr<Tensor> *out, const std::shared_ptr<Tensor> &in) {
+    const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
+    *out = std::allocate_shared<Tensor>(*alloc, in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes());
+    return Status::OK();
+  }
+
   // A static factory method to create a Tensor from a given py::array.
   // @param ptr output argument to hold the created Tensor
   // @param arr py::array
@@ -10,4 +10,5 @@ add_library(kernels-data OBJECT
     slice_op.cc
     mask_op.cc
     concatenate_op.cc
+    duplicate_op.cc
     )
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dataset/kernels/data/duplicate_op.h"
+
+#include "dataset/core/tensor.h"
+#include "dataset/kernels/tensor_op.h"
+
+namespace mindspore {
+namespace dataset {
+
+Status DuplicateOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  std::shared_ptr<Tensor> out;
+  RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, input[0]));
+  output->push_back(input[0]);
+  output->push_back(out);
+  return Status::OK();
+}
+}  // namespace dataset
+}  // namespace mindspore
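Compute keeps the input tensor as the first output and appends a deep copy made through the new Tensor::CreateTensor overload, so the two outputs share values but not buffers. A minimal sketch of the resulting column behaviour, assuming the c_transforms wrapper introduced later in this commit:

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as ops

    # One input column "x" becomes two columns with equal values;
    # the second column is an independent copy of the first.
    data = ds.NumpySlicesDataset([[1, 2, 3]], column_names="x")
    data = data.map(input_columns=["x"], output_columns=["x", "y"],
                    columns_order=["x", "y"], operations=ops.Duplicate())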
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef DATASET_KERNELS_DATA_DUPLICATE_OP_H_
+#define DATASET_KERNELS_DATA_DUPLICATE_OP_H_
+
+#include <vector>
+#include <memory>
+
+#include "dataset/core/tensor.h"
+#include "dataset/kernels/tensor_op.h"
+
+namespace mindspore {
+namespace dataset {
+
+class DuplicateOp : public TensorOp {
+ public:
+  DuplicateOp() = default;
+
+  ~DuplicateOp() override = default;
+
+  void Print(std::ostream &out) const override { out << "DuplicateOp"; }
+
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+
+  uint32_t NumOutput() override { return 2; }
+};
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // DATASET_KERNELS_DATA_DUPLICATE_OP_H_
@@ -4869,10 +4869,10 @@ class BuildVocabDataset(DatasetOp):
         top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
             taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken (default=None,
             all words are included).
-        special_tokens(list): a list of strings, each one is a special token. for e.g. ["<pad>","<unk>"]
-            (default=None, no special tokens will be added).
-        special_first(bool): whether special_tokens will be prepended/appended to vocab, If special_tokens is
-            specified and special_first is set to None, special_tokens will be prepended. (default=None).
+        special_tokens(list, optional): a list of strings, each one is a special token. for example
+            special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
+        special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
+            is specified and special_first is set to None, special_tokens will be prepended. (default=None).
         prefetch_size (int, optional): prefetch number of records ahead of the user's request (default=None).
         """
@@ -50,8 +50,8 @@ class Vocab(cde.Vocab):
         top_k(int, optional): top_k > 0. Number of words to be built into vocab. top_k most frequent words are
             taken. top_k is taken after freq_range. If not enough top_k, all words will be taken. (default=None
             all words are included).
-        special_tokens(list): a list of strings, each one is a special token. for e.g. ["<pad>","<unk>"]
-            (default=None, no special tokens will be added).
+        special_tokens(list, optional): a list of strings, each one is a special token. for example
+            special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
         special_first(bool, optional): whether special_tokens will be prepended/appended to vocab. If special_tokens
             is specified and special_first is set to None, special_tokens will be prepended. (default=None).
         return:
@@ -72,8 +72,8 @@ class Vocab(cde.Vocab):
        build a vocab object from a list of word.
        Args:
            word_list(list): a list of string where each element is a word of type string.
-           special_tokens(list): a list of strings, each one is a special token. for e.g. ["<pad>","<unk>"]
-               (default=None, no special tokens will be added).
+           special_tokens(list, optional): a list of strings, each one is a special token. for example
+               special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens
                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
        """
@@ -89,8 +89,8 @@ class Vocab(cde.Vocab):
            delimiter(str, optional): a delimiter to break up each line in file, the first element is taken to be
                the word (default=None).
            vocab_size(int, optional): number of words to read from file_path (default=None, all words are taken).
-           special_tokens(list): a list of strings, each one is a special token. for e.g. ["<pad>","<unk>"]
-               (default=None, no special tokens will be added).
+           special_tokens(list, optional): a list of strings, each one is a special token. for example
+               special_tokens=["<pad>","<unk>"] (default=None, no special tokens will be added).
            special_first(bool, optional): whether special_tokens will be prepended/appended to vocab, If special_tokens
                is specified and special_first is set to None, special_tokens will be prepended. (default=None).
            """
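The three docstring fixes above all describe the same contract for special_tokens and special_first. A minimal sketch of the resulting id assignment, consistent with the updated test_from_file expectations further below (the word list here is illustrative):

    import mindspore.dataset.text as text

    # special_first=True: "<pad>" and "<unk>" take ids 0 and 1,
    # so "w1", "w2", "w3" are assigned ids 2, 3 and 4.
    vocab = text.Vocab.from_list(["w1", "w2", "w3"],
                                 special_tokens=["<pad>", "<unk>"],
                                 special_first=True)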
@@ -203,3 +203,22 @@ class Concatenate(cde.ConcatenateOp):
     def __init__(self, axis=0, prepend=None, append=None):
         # add some validations here later
         super().__init__(axis, prepend, append)
+
+
+class Duplicate(cde.DuplicateOp):
+    """
+    Duplicate the input tensor to a new output tensor. The input tensor is carried over to the output list.
+
+    Examples:
+        >>> # Data before
+        >>> # |  x      |
+        >>> # +---------+
+        >>> # | [1,2,3] |
+        >>> # +---------+
+        >>> data = data.map(input_columns=["x"], operations=Duplicate(),
+        >>>         output_columns=["x", "y"], columns_order=["x", "y"])
+        >>> # Data after
+        >>> # |  x      |  y      |
+        >>> # +---------+---------+
+        >>> # | [1,2,3] | [1,2,3] |
+        >>> # +---------+---------+
+    """
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "dataset/core/client.h"
+#include "common/common.h"
+#include "gtest/gtest.h"
+#include "dataset/core/tensor.h"
+#include "dataset/util/de_error.h"
+#include "dataset/kernels/data/duplicate_op.h"
+
+using namespace mindspore::dataset;
+
+namespace py = pybind11;
+
+class MindDataTestDuplicateOp : public UT::Common {
+ public:
+  MindDataTestDuplicateOp() {}
+
+  void SetUp() { GlobalInit(); }
+};
+
+TEST_F(MindDataTestDuplicateOp, Basics) {
+  std::shared_ptr<Tensor> t;
+  Tensor::CreateTensor(&t, std::vector<uint32_t>({1, 2, 3, 4, 5, 6}));
+  std::shared_ptr<Tensor> v;
+  Tensor::CreateTensor(&v, std::vector<uint32_t>({3}), TensorShape::CreateScalar());
+  std::shared_ptr<DuplicateOp> op = std::make_shared<DuplicateOp>();
+  TensorRow in;
+  in.push_back(t);
+  TensorRow out;
+  ASSERT_TRUE(op->Compute(in, &out).IsOk());
+
+  ASSERT_TRUE(*t == *out[0]);
+  ASSERT_TRUE(*t == *out[1]);
+  ASSERT_TRUE(t->GetBuffer() == out[0]->GetBuffer());
+  ASSERT_TRUE(t->GetBuffer() != out[1]->GetBuffer());
+}
@@ -0,0 +1,40 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Testing Duplicate op in DE
+"""
+import numpy as np
+
+import mindspore.dataset as ds
+import mindspore.dataset.transforms.c_transforms as ops
+
+
+def compare(array):
+    data = ds.NumpySlicesDataset([array], column_names="x")
+    array = np.array(array)
+    data = data.map(input_columns=["x"], output_columns=["x", "y"], columns_order=["x", "y"],
+                    operations=ops.Duplicate())
+    for d in data.create_dict_iterator():
+        np.testing.assert_array_equal(array, d["x"])
+        np.testing.assert_array_equal(array, d["y"])
+
+
+def test_duplicate_basics():
+    compare([1, 2, 3])
+    compare([b"1", b"2", b"3"])
+
+
+if __name__ == "__main__":
+    test_duplicate_basics()
@@ -1,4 +1,4 @@
-# Copyright 2019 Huawei Technologies Co., Ltd
+# Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -94,9 +94,10 @@ def test_from_file():
         for word in texts.split(" "):
             yield (np.array(word, dtype='S'),)

-    def test_config(lookup_str, special_tokens, special_first):
+    def test_config(lookup_str, vocab_size, special_tokens, special_first):
         try:
-            vocab = text.Vocab.from_file(SIMPLE_VOCAB_FILE, special_tokens=special_tokens, special_first=special_first)
+            vocab = text.Vocab.from_file(SIMPLE_VOCAB_FILE, vocab_size=vocab_size, special_tokens=special_tokens,
+                                         special_first=special_first)
             data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
             data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
             res = []
@@ -106,9 +107,14 @@ def test_from_file():
         except ValueError as e:
             return str(e)

-    assert test_config("w1 w2 w3", ["s1", "s2", "s3"], True) == [3, 4, 5]
-    assert test_config("w1 w2 w3", ["s1", "s2", "s3"], False) == [0, 1, 2]
-    assert "special_tokens contains duplicate" in test_config("w1", ["s1", "s1"], True)
+    # test special tokens are prepended
+    assert test_config("w1 w2 w3 s1 s2 s3", None, ["s1", "s2", "s3"], True) == [3, 4, 5, 0, 1, 2]
+    # test special tokens are appended
+    assert test_config("w1 w2 w3 s1 s2 s3", None, ["s1", "s2", "s3"], False) == [0, 1, 2, 8, 9, 10]
+    # test special tokens are appended when not all words in file are used
+    assert test_config("w1 w2 w3 s1 s2 s3", 3, ["s1", "s2", "s3"], False) == [0, 1, 2, 3, 4, 5]
+    # test exception when special_tokens contains duplicate words
+    assert "special_tokens contains duplicate" in test_config("w1", None, ["s1", "s1"], True)


 if __name__ == '__main__':