forked from mindspore-Ecosystem/mindspore
optimize cpu unique
This commit is contained in:
parent
7d250f2218
commit
618c05d454
|
@ -19,45 +19,67 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
const size_t kUseBucketUniqueSize = 100000;
|
||||
const size_t kUniqueThreadNum = 23;
|
||||
void UniqueCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
CheckParam(kernel_node);
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
n_ = input_shape[0];
|
||||
input_size_ = input_shape[0];
|
||||
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
|
||||
}
|
||||
|
||||
void UniqueCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
|
||||
CPUKernel::InitInputOutputSize(kernel_node);
|
||||
workspace_size_list_.emplace_back(input_size_ * sizeof(int64_t));
|
||||
workspace_size_list_.emplace_back(input_size_ * sizeof(int64_t));
|
||||
workspace_size_list_.emplace_back(input_size_ * sizeof(int64_t));
|
||||
}
|
||||
|
||||
bool UniqueCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> & /*workspace*/,
|
||||
const std::vector<kernel::AddressPtr> &workspace,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
if (dtype_ == kNumberTypeInt32) {
|
||||
LaunchKernel<int>(inputs, outputs);
|
||||
} else if (dtype_ == kNumberTypeFloat32) {
|
||||
LaunchKernel<float>(inputs, outputs);
|
||||
LaunchKernel<int, int>(inputs, workspace, outputs);
|
||||
} else if (dtype_ == kNumberTypeInt64) {
|
||||
LaunchKernel<int64_t>(inputs, outputs);
|
||||
LaunchKernel<int64_t, int>(inputs, workspace, outputs);
|
||||
} else if (dtype_ == kNumberTypeFloat32) {
|
||||
LaunchKernel<float, int>(inputs, workspace, outputs);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void UniqueCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
|
||||
auto x_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto y_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
auto idx_addr = reinterpret_cast<int64_t *>(outputs[1]->addr);
|
||||
|
||||
std::unordered_map<T, int64_t> uniq;
|
||||
int n = SizeToInt(n_);
|
||||
uniq.reserve(n * 2);
|
||||
for (int i = 0, j = 0; i < n; ++i) {
|
||||
auto it = uniq.emplace(x_addr[i], j);
|
||||
idx_addr[i] = it.first->second;
|
||||
if (it.second) {
|
||||
++j;
|
||||
}
|
||||
template <typename DataType, typename IndexType>
|
||||
void UniqueCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
if (input_size_ == 0) {
|
||||
return;
|
||||
}
|
||||
for (const auto &it : uniq) {
|
||||
y_addr[it.second] = it.first;
|
||||
if (inputs.size() < 1) {
|
||||
MS_LOG(EXCEPTION) << "Input size should be large than 0";
|
||||
}
|
||||
if (workspace.size() < 3) {
|
||||
MS_LOG(EXCEPTION) << "workspace size should be large than 2";
|
||||
}
|
||||
if (outputs.size() < 2) {
|
||||
MS_LOG(EXCEPTION) << "Output size should be large than 1";
|
||||
}
|
||||
auto params = std::make_shared<UniqueParam<DataType, IndexType>>();
|
||||
params->input_ = reinterpret_cast<DataType *>(inputs[0]->addr);
|
||||
params->input_idx_ = reinterpret_cast<IndexType *>(workspace[0]->addr);
|
||||
params->workspace_ = reinterpret_cast<DataType *>(workspace[1]->addr);
|
||||
params->workspace_idx_ = reinterpret_cast<IndexType *>(workspace[2]->addr);
|
||||
params->output_ = reinterpret_cast<DataType *>(outputs[0]->addr);
|
||||
params->inverse_idx_ = reinterpret_cast<IndexType *>(outputs[1]->addr);
|
||||
params->input_size_ = input_size_;
|
||||
params->output_size_ = 0;
|
||||
params->need_sort_ = true;
|
||||
params->thread_num_ = kUniqueThreadNum;
|
||||
if (input_size_ < kUseBucketUniqueSize) {
|
||||
Unique(params);
|
||||
} else {
|
||||
BucketUnique(params);
|
||||
}
|
||||
output_size_ = params->output_size_;
|
||||
}
|
||||
|
||||
void UniqueCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
|
|
|
@ -16,31 +16,339 @@
|
|||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename DataType, typename IndexType>
|
||||
struct UniqueParam {
|
||||
DataType *input_{nullptr};
|
||||
IndexType *input_idx_{nullptr};
|
||||
DataType *output_{nullptr};
|
||||
IndexType *inverse_idx_{nullptr};
|
||||
DataType *workspace_{nullptr};
|
||||
IndexType *workspace_idx_{nullptr};
|
||||
IndexType input_size_{0};
|
||||
IndexType output_size_{0};
|
||||
size_t thread_num_{0};
|
||||
bool need_sort_{true};
|
||||
};
|
||||
|
||||
class UniqueCPUKernel : public CPUKernel {
|
||||
public:
|
||||
UniqueCPUKernel() = default;
|
||||
~UniqueCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
void InitInputOutputSize(const CNodePtr &kernel_node) override;
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
template <typename T>
|
||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
template <typename DataType, typename IndexType>
|
||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs);
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
size_t n_{0};
|
||||
protected:
|
||||
virtual void CheckParam(const CNodePtr &kernel_node);
|
||||
size_t input_size_{0};
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
size_t output_size_{0};
|
||||
|
||||
template <typename DataType>
|
||||
static size_t BucketId(DataType data, size_t bucket_num) {
|
||||
return static_cast<size_t>(data) % bucket_num;
|
||||
}
|
||||
|
||||
template <typename DataType, typename IndexType>
|
||||
static void CalculateEachBucketSize(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms,
|
||||
std::vector<IndexType> *each_bucket_size) {
|
||||
MS_EXCEPTION_IF_NULL(params);
|
||||
MS_EXCEPTION_IF_NULL(params->input_);
|
||||
MS_EXCEPTION_IF_NULL(each_bucket_size);
|
||||
size_t bucket_num = each_bucket_size->size();
|
||||
for (IndexType i = 0; i < params->input_size_; ++i) {
|
||||
auto bucket_id = BucketId(params->input_[i], bucket_num);
|
||||
each_bucket_size->at(bucket_id)++;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DataType, typename IndexType>
|
||||
static void SplitAndCalculateBucketSize(
|
||||
const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms,
|
||||
std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *segments_ptr,
|
||||
std::vector<std::shared_ptr<std::vector<IndexType>>> *segment_bucket_sizes_ptr) {
|
||||
MS_EXCEPTION_IF_NULL(params);
|
||||
MS_EXCEPTION_IF_NULL(params->input_);
|
||||
MS_EXCEPTION_IF_NULL(segments_ptr);
|
||||
MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
|
||||
auto &segments = *segments_ptr;
|
||||
auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
|
||||
|
||||
IndexType input_size = params->input_size_;
|
||||
size_t thread_num = params->thread_num_;
|
||||
if (thread_num < 1) {
|
||||
MS_LOG(EXCEPTION) << "Thread num must > 0 !";
|
||||
}
|
||||
IndexType thread_data_size = input_size / thread_num;
|
||||
size_t left_data_size = input_size % thread_num;
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(thread_num);
|
||||
segments.reserve(thread_num);
|
||||
segment_bucket_sizes.reserve(thread_num);
|
||||
IndexType current_offset = 0;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
segment_bucket_sizes.emplace_back(std::make_shared<std::vector<IndexType>>(thread_num, 0));
|
||||
IndexType data_size = thread_data_size;
|
||||
if (i < left_data_size) {
|
||||
data_size += 1;
|
||||
}
|
||||
segments.emplace_back(std::make_shared<UniqueParam<DataType, IndexType>>());
|
||||
segments[i]->input_ = params->input_ + current_offset;
|
||||
segments[i]->input_size_ = data_size;
|
||||
segments[i]->thread_num_ = thread_num;
|
||||
threads.emplace_back(
|
||||
std::thread(CalculateEachBucketSize<DataType, IndexType>, segments[i], segment_bucket_sizes[i].get()));
|
||||
current_offset += data_size;
|
||||
}
|
||||
for (size_t i = 0; i < params->thread_num_; ++i) {
|
||||
threads[i].join();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DataType, typename IndexType>
|
||||
static void SegmentToBuckets(const std::shared_ptr<UniqueParam<DataType, IndexType>> &segment,
|
||||
IndexType segment_offset,
|
||||
const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(segment);
|
||||
MS_EXCEPTION_IF_NULL(segment->input_);
|
||||
std::vector<IndexType> bucket_data_num(segment->thread_num_, 0);
|
||||
auto bucket_size = buckets.size();
|
||||
for (IndexType i = 0; i < segment->input_size_; ++i) {
|
||||
DataType data = segment->input_[i];
|
||||
auto bucket_id = BucketId(data, segment->thread_num_);
|
||||
auto bucket_index = bucket_data_num[bucket_id];
|
||||
if (bucket_id >= bucket_size) {
|
||||
MS_LOG(ERROR) << "Error bucket id!";
|
||||
continue;
|
||||
}
|
||||
auto &bucket = buckets[bucket_id];
|
||||
MS_EXCEPTION_IF_NULL(bucket);
|
||||
if (bucket_index >= bucket->input_size_) {
|
||||
MS_LOG(ERROR) << "Error bucket index!";
|
||||
continue;
|
||||
}
|
||||
bucket->input_[bucket_index] = data;
|
||||
bucket->workspace_idx_[bucket_index] = segment_offset + i;
|
||||
bucket_data_num[bucket_id]++;
|
||||
}
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
template <typename DataType, typename IndexType>
|
||||
static void GatherSegmentsToBuckets(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms,
|
||||
std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *segments_ptr,
|
||||
std::vector<std::shared_ptr<std::vector<IndexType>>> *segment_bucket_sizes_ptr,
|
||||
std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *buckets_ptr) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(params);
|
||||
MS_EXCEPTION_IF_NULL(params->workspace_);
|
||||
MS_EXCEPTION_IF_NULL(params->inverse_idx_);
|
||||
MS_EXCEPTION_IF_NULL(params->workspace_idx_);
|
||||
MS_EXCEPTION_IF_NULL(params->output_);
|
||||
MS_EXCEPTION_IF_NULL(params->input_idx_);
|
||||
MS_EXCEPTION_IF_NULL(segments_ptr);
|
||||
MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
|
||||
MS_EXCEPTION_IF_NULL(buckets_ptr);
|
||||
auto &segments = *segments_ptr;
|
||||
auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
|
||||
auto &buckets = *buckets_ptr;
|
||||
auto thread_num = segments.size();
|
||||
buckets.reserve(thread_num);
|
||||
std::vector<IndexType> bucket_data_size(thread_num, 0);
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
for (size_t j = 0; j < thread_num; ++j) {
|
||||
bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
|
||||
}
|
||||
}
|
||||
|
||||
IndexType current_offset = 0;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
auto bucket = std::make_shared<UniqueParam<DataType, IndexType>>();
|
||||
bucket->input_ = params->output_ + current_offset;
|
||||
bucket->input_idx_ = params->inverse_idx_ + current_offset;
|
||||
bucket->workspace_idx_ = params->workspace_idx_ + current_offset;
|
||||
bucket->output_ = params->workspace_ + current_offset;
|
||||
bucket->inverse_idx_ = params->input_idx_ + current_offset;
|
||||
bucket->input_size_ = bucket_data_size[i];
|
||||
current_offset += bucket_data_size[i];
|
||||
buckets.emplace_back(bucket);
|
||||
}
|
||||
std::vector<IndexType> tmp_bucket_data_size(thread_num, 0);
|
||||
std::vector<std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>>> thread_buckets;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> local_buckets;
|
||||
for (size_t j = 0; j < thread_num; ++j) {
|
||||
auto bucket = std::make_shared<UniqueParam<DataType, IndexType>>();
|
||||
bucket->input_ = buckets[j]->input_ + tmp_bucket_data_size[j];
|
||||
bucket->input_size_ = buckets[j]->input_size_ - tmp_bucket_data_size[j];
|
||||
bucket->workspace_idx_ = buckets[j]->workspace_idx_ + tmp_bucket_data_size[j];
|
||||
local_buckets.emplace_back(bucket);
|
||||
tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
|
||||
}
|
||||
thread_buckets.emplace_back(local_buckets);
|
||||
}
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(thread_num);
|
||||
current_offset = 0;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
MS_EXCEPTION_IF_NULL(segments[i]);
|
||||
threads.emplace_back(
|
||||
std::thread(SegmentToBuckets<DataType, IndexType>, segments[i], current_offset, thread_buckets[i]));
|
||||
current_offset += segments[i]->input_size_;
|
||||
}
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
threads[i].join();
|
||||
}
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
template <typename DataType, typename IndexType>
|
||||
static void Unique(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(params);
|
||||
DataType *input = params->input_;
|
||||
IndexType *input_idx = params->input_idx_;
|
||||
DataType *output = params->output_;
|
||||
IndexType *inverse_idx = params->inverse_idx_;
|
||||
MS_EXCEPTION_IF_NULL(input);
|
||||
MS_EXCEPTION_IF_NULL(input_idx);
|
||||
MS_EXCEPTION_IF_NULL(output);
|
||||
MS_EXCEPTION_IF_NULL(inverse_idx);
|
||||
IndexType j = 0;
|
||||
if (params->need_sort_) {
|
||||
for (IndexType i = 0; i < params->input_size_; ++i) {
|
||||
input_idx[i] = i;
|
||||
}
|
||||
std::sort(input_idx, input_idx + params->input_size_,
|
||||
[&](IndexType left, IndexType right) { return input[left] < input[right]; });
|
||||
DataType last = input[0];
|
||||
for (IndexType i = 0; i < params->input_size_; ++i) {
|
||||
auto curr = input[input_idx[i]];
|
||||
if (i == 0 || curr != last) {
|
||||
if (i != 0) {
|
||||
j++;
|
||||
}
|
||||
output[j] = curr;
|
||||
inverse_idx[input_idx[i]] = j;
|
||||
last = curr;
|
||||
} else {
|
||||
inverse_idx[input_idx[i]] = j;
|
||||
}
|
||||
}
|
||||
params->output_size_ = j + 1;
|
||||
} else {
|
||||
std::unordered_map<DataType, IndexType> uniq;
|
||||
uniq.reserve(params->input_size_);
|
||||
for (IndexType i = 0; i < params->input_size_; ++i) {
|
||||
auto it = uniq.emplace(input[i], j);
|
||||
inverse_idx[i] = it.first->second;
|
||||
if (it.second) {
|
||||
++j;
|
||||
}
|
||||
}
|
||||
for (const auto &it : uniq) {
|
||||
output[it.second] = it.first;
|
||||
}
|
||||
params->output_size_ = j;
|
||||
}
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
template <typename DataType, typename IndexType>
|
||||
static void UniqueEachBucket(const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
size_t thread_num = buckets.size();
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(thread_num);
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
threads.emplace_back(std::thread(Unique<DataType, IndexType>, buckets[i]));
|
||||
}
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
threads[i].join();
|
||||
}
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
template <typename DataType, typename IndexType>
|
||||
static void TransformBucketReverseIndices(const std::shared_ptr<UniqueParam<DataType, IndexType>> &bucket,
|
||||
const std::shared_ptr<UniqueParam<DataType, IndexType>> &result,
|
||||
IndexType offset) {
|
||||
MS_EXCEPTION_IF_NULL(bucket);
|
||||
MS_EXCEPTION_IF_NULL(bucket->inverse_idx_);
|
||||
MS_EXCEPTION_IF_NULL(bucket->workspace_idx_);
|
||||
MS_EXCEPTION_IF_NULL(result);
|
||||
MS_EXCEPTION_IF_NULL(result->inverse_idx_);
|
||||
for (IndexType i = 0; i < bucket->input_size_; ++i) {
|
||||
auto origin_idx = bucket->workspace_idx_[i];
|
||||
if (origin_idx >= 0 && origin_idx < result->input_size_) {
|
||||
result->inverse_idx_[origin_idx] = bucket->inverse_idx_[i] + offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DataType, typename IndexType>
|
||||
static void MergeBuckets(const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets,
|
||||
const std::shared_ptr<UniqueParam<DataType, IndexType>> &result) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(result);
|
||||
MS_EXCEPTION_IF_NULL(result->output_);
|
||||
size_t thread_num = buckets.size();
|
||||
std::vector<IndexType> bucket_offsets(thread_num);
|
||||
IndexType current_size = 0;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
auto bucket = buckets[i];
|
||||
MS_EXCEPTION_IF_NULL(bucket);
|
||||
MS_EXCEPTION_IF_NULL(bucket->output_);
|
||||
bucket_offsets[i] = current_size;
|
||||
auto ret_code = memcpy_s(result->output_ + current_size, (result->input_size_ - current_size) * sizeof(DataType),
|
||||
bucket->output_, bucket->output_size_ * sizeof(DataType));
|
||||
if (ret_code != EOK) {
|
||||
MS_LOG(EXCEPTION) << "Failed to copy data!";
|
||||
}
|
||||
current_size += bucket->output_size_;
|
||||
}
|
||||
result->output_size_ = current_size;
|
||||
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(thread_num);
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
threads.emplace_back(
|
||||
std::thread(TransformBucketReverseIndices<DataType, IndexType>, buckets[i], result, bucket_offsets[i]));
|
||||
}
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
threads[i].join();
|
||||
}
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
template <typename DataType, typename IndexType>
|
||||
static void BucketUnique(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms) {
|
||||
MS_EXCEPTION_IF_NULL(params);
|
||||
std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> segments;
|
||||
std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> buckets;
|
||||
std::vector<std::shared_ptr<std::vector<IndexType>>> segment_bucket_sizes;
|
||||
SplitAndCalculateBucketSize(params, &segments, &segment_bucket_sizes);
|
||||
GatherSegmentsToBuckets(params, &segments, &segment_bucket_sizes, &buckets);
|
||||
UniqueEachBucket(buckets);
|
||||
MergeBuckets(buckets, params);
|
||||
}
|
||||
};
|
||||
|
||||
MS_REG_CPU_KERNEL(
|
||||
|
|
|
@ -19,49 +19,33 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
void UniqueWithPadCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
CheckParam(kernel_node);
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
n_ = SizeToLong(input_shape[0]);
|
||||
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
|
||||
}
|
||||
|
||||
bool UniqueWithPadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> & /*workspace*/,
|
||||
const std::vector<kernel::AddressPtr> &workspace,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
UniqueCPUKernel::Launch(inputs, workspace, outputs);
|
||||
if (dtype_ == kNumberTypeInt32) {
|
||||
LaunchKernel<int>(inputs, outputs);
|
||||
PadOutput<int>(inputs, outputs);
|
||||
} else if (dtype_ == kNumberTypeInt64) {
|
||||
LaunchKernel<int64_t>(inputs, outputs);
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Only unsupported int32 or int64 dtype";
|
||||
PadOutput<int64_t>(inputs, outputs);
|
||||
} else if (dtype_ == kNumberTypeFloat32) {
|
||||
PadOutput<float>(inputs, outputs);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void UniqueWithPadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
T *a = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
void UniqueWithPadCPUKernel::PadOutput(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
|
||||
if (inputs.size() < 2) {
|
||||
MS_LOG(EXCEPTION) << "Input size should be large than 1";
|
||||
}
|
||||
if (outputs.size() < 1) {
|
||||
MS_LOG(EXCEPTION) << "Output size should be large than 0";
|
||||
}
|
||||
T pad_num = *reinterpret_cast<T *>(inputs[1]->addr);
|
||||
T *out = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
T *idx_vec = reinterpret_cast<T *>(outputs[1]->addr);
|
||||
|
||||
for (int64_t i = 0; i < n_; ++i) {
|
||||
for (size_t i = output_size_; i < input_size_; ++i) {
|
||||
out[i] = pad_num;
|
||||
}
|
||||
std::unordered_map<T, int> uniq;
|
||||
uniq.reserve(n_);
|
||||
for (int64_t i = 0, j = 0; i < n_; ++i) {
|
||||
auto it = uniq.emplace(a[i], j);
|
||||
idx_vec[i] = it.first->second;
|
||||
if (it.second) {
|
||||
++j;
|
||||
}
|
||||
}
|
||||
for (const auto &it : uniq) {
|
||||
out[it.second] = it.first;
|
||||
}
|
||||
}
|
||||
|
||||
void UniqueWithPadCPUKernel::CheckParam(const CNodePtr &kernel_node) {
|
||||
|
|
|
@ -16,31 +16,26 @@
|
|||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
#include "backend/kernel_compiler/cpu/unique_cpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
class UniqueWithPadCPUKernel : public CPUKernel {
|
||||
class UniqueWithPadCPUKernel : public UniqueCPUKernel {
|
||||
public:
|
||||
UniqueWithPadCPUKernel() = default;
|
||||
~UniqueWithPadCPUKernel() override = default;
|
||||
|
||||
void InitKernel(const CNodePtr &kernel_node) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
template <typename T>
|
||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
void PadOutput(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
int64_t n_{0};
|
||||
TypeId dtype_{kTypeUnknown};
|
||||
protected:
|
||||
void CheckParam(const CNodePtr &kernel_node) override;
|
||||
};
|
||||
|
||||
MS_REG_CPU_KERNEL(UniqueWithPad,
|
||||
|
@ -56,7 +51,15 @@ MS_REG_CPU_KERNEL(UniqueWithPad,
|
|||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddInputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64)
|
||||
.AddOutputAttr(kNumberTypeInt64),
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
UniqueWithPadCPUKernel);
|
||||
|
||||
MS_REG_CPU_KERNEL(UniqueWithPad,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
UniqueWithPadCPUKernel);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -33,7 +33,7 @@ class Net(nn.Cell):
|
|||
return self.uniq(x)
|
||||
|
||||
|
||||
def test_net():
|
||||
def test_net_fp32():
|
||||
x = Tensor(np.array([1, 2, 5, 2]), mstype.float32)
|
||||
uniq = Net()
|
||||
output = uniq(x)
|
||||
|
@ -45,3 +45,31 @@ def test_net():
|
|||
|
||||
assert (output[0].asnumpy() == expect_y_result).all()
|
||||
assert (output[1].asnumpy() == expect_idx_result).all()
|
||||
|
||||
|
||||
def test_net_int32():
|
||||
x = Tensor(np.array([1, 2, 5, 2]), mstype.int32)
|
||||
uniq = Net()
|
||||
output = uniq(x)
|
||||
print("x:\n", x)
|
||||
print("y:\n", output[0])
|
||||
print("idx:\n", output[1])
|
||||
expect_y_result = [1, 2, 5]
|
||||
expect_idx_result = [0, 1, 2, 1]
|
||||
|
||||
assert (output[0].asnumpy() == expect_y_result).all()
|
||||
assert (output[1].asnumpy() == expect_idx_result).all()
|
||||
|
||||
|
||||
def test_net_int64():
|
||||
x = Tensor(np.array([1, 2, 5, 2]), mstype.int64)
|
||||
uniq = Net()
|
||||
output = uniq(x)
|
||||
print("x:\n", x)
|
||||
print("y:\n", output[0])
|
||||
print("idx:\n", output[1])
|
||||
expect_y_result = [1, 2, 5]
|
||||
expect_idx_result = [0, 1, 2, 1]
|
||||
|
||||
assert (output[0].asnumpy() == expect_y_result).all()
|
||||
assert (output[1].asnumpy() == expect_idx_result).all()
|
||||
|
|
|
@ -29,7 +29,7 @@ class UniqueCpuKernelTest : public UT::Common {
|
|||
UniqueCpuKernelTest() : unique_(std::make_shared<UniqueCPUKernel>()) {}
|
||||
|
||||
void SetUp() override {
|
||||
unique_->n_ = 9;
|
||||
unique_->input_size_ = 9;
|
||||
unique_->dtype_ = kNumberTypeFloat32;
|
||||
inputs_.clear();
|
||||
workspace_.clear();
|
||||
|
@ -42,16 +42,19 @@ class UniqueCpuKernelTest : public UT::Common {
|
|||
return kernel_addr;
|
||||
}
|
||||
|
||||
void CreateInputAddress() { inputs_.push_back(CreateKernelAddress(x_.data())); }
|
||||
|
||||
void CreateOutputAddress() {
|
||||
void CreateAddress() {
|
||||
inputs_.push_back(CreateKernelAddress(x_.data()));
|
||||
outputs_.push_back(CreateKernelAddress(y_.data()));
|
||||
outputs_.push_back(CreateKernelAddress(idx_.data()));
|
||||
workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
|
||||
workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
|
||||
workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
|
||||
}
|
||||
|
||||
std::vector<float> x_;
|
||||
std::vector<float> y_;
|
||||
std::vector<int64_t> idx_;
|
||||
std::vector<int> idx_;
|
||||
std::vector<int64_t> workspace_idx_;
|
||||
std::vector<AddressPtr> inputs_;
|
||||
std::vector<AddressPtr> workspace_;
|
||||
std::vector<AddressPtr> outputs_;
|
||||
|
@ -62,13 +65,13 @@ TEST_F(UniqueCpuKernelTest, compute_test) {
|
|||
x_ = {1, 1, 2, 4, 4, 4, 7, 8, 8};
|
||||
y_ = {1, 1, 1, 1, 1};
|
||||
idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
CreateInputAddress();
|
||||
CreateOutputAddress();
|
||||
workspace_idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
CreateAddress();
|
||||
unique_->Launch(inputs_, workspace_, outputs_);
|
||||
|
||||
// check compute result
|
||||
std::vector<float> expect_y{1, 2, 4, 7, 8};
|
||||
std::vector<int64_t> expect_idx{0, 0, 1, 2, 2, 2, 3, 4, 4};
|
||||
std::vector<int> expect_idx{0, 0, 1, 2, 2, 2, 3, 4, 4};
|
||||
EXPECT_TRUE(y_ == expect_y);
|
||||
EXPECT_TRUE(idx_ == expect_idx);
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@ class UniqueWithPadCpuKernelTest : public UT::Common {
|
|||
UniqueWithPadCpuKernelTest() : unique_with_pad_(std::make_shared<UniqueWithPadCPUKernel>()) {}
|
||||
|
||||
void SetUp() override {
|
||||
unique_with_pad_->n_ = 10;
|
||||
unique_with_pad_->input_size_ = 10;
|
||||
unique_with_pad_->dtype_ = kNumberTypeInt64;
|
||||
inputs_.clear();
|
||||
workspace_.clear();
|
||||
|
@ -42,21 +42,21 @@ class UniqueWithPadCpuKernelTest : public UT::Common {
|
|||
return kernel_addr;
|
||||
}
|
||||
|
||||
void CreateInputAddress() {
|
||||
void CreateAddress() {
|
||||
inputs_.push_back(CreateKernelAddress(x_.data()));
|
||||
inputs_.push_back(CreateKernelAddress(&pad_dim_));
|
||||
;
|
||||
}
|
||||
|
||||
void CreateOutputAddress() {
|
||||
outputs_.push_back(CreateKernelAddress(out_.data()));
|
||||
outputs_.push_back(CreateKernelAddress(idx_.data()));
|
||||
workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
|
||||
workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
|
||||
workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
|
||||
}
|
||||
|
||||
std::vector<int64_t> x_;
|
||||
int64_t pad_dim_;
|
||||
std::vector<int64_t> out_;
|
||||
std::vector<int64_t> idx_;
|
||||
std::vector<int> idx_;
|
||||
std::vector<int64_t> workspace_idx_;
|
||||
std::vector<AddressPtr> inputs_;
|
||||
std::vector<AddressPtr> workspace_;
|
||||
std::vector<AddressPtr> outputs_;
|
||||
|
@ -68,13 +68,13 @@ TEST_F(UniqueWithPadCpuKernelTest, compute_test) {
|
|||
pad_dim_ = 8;
|
||||
out_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
CreateInputAddress();
|
||||
CreateOutputAddress();
|
||||
workspace_idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
CreateAddress();
|
||||
unique_with_pad_->Launch(inputs_, workspace_, outputs_);
|
||||
|
||||
// check compute result
|
||||
std::vector<int64_t> expect_out{1, 5, 4, 3, 2, 8, 8, 8, 8, 8};
|
||||
std::vector<int64_t> expect_idx{0, 0, 1, 1, 2, 2, 3, 3, 4, 4};
|
||||
std::vector<int64_t> expect_out{1, 2, 3, 4, 5, 8, 8, 8, 8, 8};
|
||||
std::vector<int> expect_idx{0, 0, 4, 4, 3, 3, 2, 2, 1, 1};
|
||||
EXPECT_TRUE(out_ == expect_out);
|
||||
EXPECT_TRUE(idx_ == expect_idx);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue