migrates aicpu kernels to MS from lqk

This commit is contained in:
lilinjie 2022-12-21 11:40:36 +08:00
parent 2f3d008c2b
commit 389acff921
21 changed files with 2012 additions and 144 deletions

View File

@@ -86,4 +86,4 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "unreadVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "unreadVariable"

View File

@@ -130,4 +130,5 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "readability/namespace"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/braces"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "build/include"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/end_of_line"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/end_of_line"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "readability/casting"

View File

@@ -387,6 +387,7 @@ constexpr auto kLSTMOpName = "LSTM";
constexpr auto kLuUnpackOpName = "LuUnpack";
constexpr auto kMaskedFillOpName = "MaskedFill";
constexpr auto kMaskedSelectOpName = "MaskedSelect";
constexpr auto kMaskedSelectGradOpName = "MaskedSelectGrad";
constexpr auto kMatMulOpName = "MatMul";
constexpr auto kMatMulV2OpName = "MatMulV2";
constexpr auto kMatrixDiagOpName = "MatrixDiag";

View File

@@ -1,127 +0,0 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cpu_kernel/ms_kernel/acos.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <algorithm>
#include "cpu_kernel/common/cpu_kernel_utils.h"
#include "cpu_kernel/inc/cpu_types.h"
#include "mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/common/kernel_log.h"
#include "cpu_kernel/common/status.h"
#include "utils/kernel_util.h"
namespace {
const std::uint32_t kAcosInputNum{1u};
const std::uint32_t kAcosOutputNum{1u};
const char *const kAcos{"Acos"};
const std::int64_t kAcosParallelNum{64 * 1024};
} // namespace
namespace aicpu {
namespace detail {
template <typename T>
inline T ScalarAcos(const T x) {
return std::acos(x);
}
template <>
inline Eigen::half ScalarAcos(const Eigen::half x) {
const Eigen::half val{static_cast<Eigen::half>(std::acos(static_cast<std::float_t>(x)))};
return val;
}
inline std::uint32_t ParallelForAcos(const CpuKernelContext &ctx, std::int64_t total, std::int64_t per_unit_size,
const std::function<void(std::int64_t, std::int64_t)> &work) {
if (total > kAcosParallelNum)
return aicpu::CpuKernelUtils::ParallelFor(ctx, total, per_unit_size, work);
else
work(0, total);
return KERNEL_STATUS_OK;
}
template <typename T>
inline std::uint32_t ComputeAcosKernel(const CpuKernelContext &ctx) {
T *input0{static_cast<T *>(ctx.Input(0)->GetData())};
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
std::int64_t total{ctx.Input(0)->NumElements()};
std::uint32_t cores{aicpu::CpuKernelUtils::GetCPUNum(ctx)};
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
return ParallelForAcos(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
std::transform(input0 + begin, input0 + end, output + begin, ScalarAcos<T>);
});
}
template <typename T>
inline std::uint32_t ComputeAcos(const CpuKernelContext &ctx) {
std::uint32_t result{ComputeAcosKernel<T>(ctx)};
if (result != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Acos compute failed.");
}
return result;
}
inline std::uint32_t ExtraCheckAcos(const CpuKernelContext &ctx) {
if (ctx.Input(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Output(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get output data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the output "
"[%llu].",
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
inline std::uint32_t CheckAcos(const CpuKernelContext &ctx, std::uint32_t inputs_num, std::uint32_t outputs_num) {
return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : ExtraCheckAcos(ctx);
}
inline std::uint32_t ComputeAcos(const CpuKernelContext &ctx) {
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
case DT_FLOAT16:
return ComputeAcos<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeAcos<std::float_t>(ctx);
case DT_DOUBLE:
return ComputeAcos<std::double_t>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
} // namespace detail
std::uint32_t AcosCpuKernel::Compute(const CpuKernelContext &ctx) {
return detail::CheckAcos(ctx, kAcosInputNum, kAcosOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::ComputeAcos(ctx);
}
REGISTER_CPU_KERNEL(kAcos, AcosCpuKernel);
} // namespace aicpu
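
The removed Acos kernel above follows a simple pattern: validate the inputs, then apply std::acos element-wise, parallelizing only when the element count exceeds kAcosParallelNum. A minimal standalone sketch of that pattern using only the standard library (the chunk size and sequential chunk loop are illustrative, not the CpuKernelUtils API):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

namespace {
constexpr std::int64_t kParallelThreshold = 64 * 1024;  // mirrors kAcosParallelNum

// Apply acos over one contiguous chunk [begin, end) of the flat buffer.
void AcosRange(const float *input, float *output, std::int64_t begin, std::int64_t end) {
  std::transform(input + begin, input + end, output + begin,
                 [](float v) { return std::acos(v); });
}
}  // namespace

int main() {
  std::vector<float> x = {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f};
  std::vector<float> y(x.size());
  const std::int64_t total = static_cast<std::int64_t>(x.size());
  if (total > kParallelThreshold) {
    // A real kernel would hand each chunk to a worker thread here.
    constexpr std::int64_t kChunk = 1024;
    for (std::int64_t begin = 0; begin < total; begin += kChunk) {
      AcosRange(x.data(), y.data(), begin, std::min(begin + kChunk, total));
    }
  } else {
    AcosRange(x.data(), y.data(), 0, total);  // small input: single pass
  }
  for (float v : y) std::cout << v << ' ';    // acos(-1)=pi ... acos(1)=0
  std::cout << '\n';
  return 0;
}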

View File

@@ -0,0 +1,154 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cache_swap_table.h"
#include <securec.h>
#include <map>
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/sparse_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *const kCacheSwapTable = "CacheSwapTable";
}
namespace aicpu {
template <typename T>
uint32_t CacheSwapTableTask(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, int64_t batch_size,
int64_t output_size, int64_t one_line_col, int type_size) {
if (inputs.size() == 0 || outputs.size() == 0) {
KERNEL_LOG_ERROR("CacheSwapTable input or output is empty.");
return KERNEL_STATUS_PARAM_INVALID;
}
char *cache_table = reinterpret_cast<char *>(inputs[0]->GetData());
T *swap_cache_idx = reinterpret_cast<T *>(inputs[1]->GetData());
uint64_t swap_cache_idx_size = inputs[1]->GetDataSize();
char *miss_value = reinterpret_cast<char *>(inputs[2]->GetData());
char *old_value = reinterpret_cast<char *>(outputs[0]->GetData());
errno_t ret = memset_s(old_value, static_cast<size_t>(output_size * type_size), 0x00,
static_cast<size_t>(output_size * type_size));
if (ret != EOK) {
KERNEL_LOG_ERROR("Memset failed, result[%d]", ret);
return KERNEL_STATUS_INNER_ERROR;
}
uint64_t single_copy_size = static_cast<uint64_t>(type_size * one_line_col);
if (swap_cache_idx_size < static_cast<uint64_t>(batch_size)) {
KERNEL_LOG_ERROR(
"The value of swap_cache_idx_size:[%llu] must be less than "
"batch_size:[%lld]",
swap_cache_idx_size, batch_size);
return KERNEL_STATUS_INNER_ERROR;
}
uint64_t old_value_size = outputs[0]->GetDataSize();
uint64_t cache_table_size = inputs[0]->GetDataSize();
for (int64_t i = 0; i < batch_size; ++i) {
if (swap_cache_idx[i] < 0) {
continue;
}
ret = memcpy_s(old_value + i * single_copy_size, old_value_size, cache_table + swap_cache_idx[i] * single_copy_size,
single_copy_size);
old_value_size -= single_copy_size;
if (ret != EOK) {
KERNEL_LOG_ERROR("CacheSwapTable memcpy failed, result [%d].", ret);
return KERNEL_STATUS_INNER_ERROR;
}
ret = memcpy_s(cache_table + swap_cache_idx[i] * single_copy_size, cache_table_size,
miss_value + i * single_copy_size, single_copy_size);
cache_table_size -= single_copy_size;
if (ret != EOK) {
KERNEL_LOG_ERROR("CacheSwapTable memcpy failed, result [%d].", ret);
return KERNEL_STATUS_INNER_ERROR;
}
}
return KERNEL_STATUS_OK;
}
uint32_t CacheSwapTableMsCpuKernel::DoCompute() {
std::map<int, std::function<uint32_t(std::vector<Tensor *> &, std::vector<Tensor *> &, int64_t &, int64_t &,
int64_t &, int &)>>
calls;
calls[DT_INT32] = CacheSwapTableTask<int32_t>;
calls[DT_INT64] = CacheSwapTableTask<int64_t>;
if (calls.find(indices_type_) == calls.end()) {
KERNEL_LOG_ERROR(
"CacheSwapTableMsCpuKernel op doesn't support indices tensor types: "
"[%s]",
DTypeStr(indices_type_).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
int type_size = GetSizeByDataType(param_type_);
return calls[indices_type_](inputs_, outputs_, batch_size_, output_size_, one_line_col_, type_size);
}
uint32_t CacheSwapTableMsCpuKernel::GetInputAndCheck(const CpuKernelContext &ctx) {
KERNEL_LOG_INFO("GetInputAndCheck start!");
// get input Tensors
const uint32_t kNumInput = 3;
for (uint32_t i = 0; i < kNumInput; ++i) {
Tensor *tensor = ctx.Input(i);
KERNEL_CHECK_NULLPTR(tensor, KERNEL_STATUS_PARAM_INVALID, "Get input tensor[%d] failed", i)
inputs_.push_back(tensor);
}
// get output Tensors
const uint32_t kNumOutput = 1;
for (uint32_t i = 0; i < kNumOutput; ++i) {
Tensor *tensor = ctx.Output(i);
KERNEL_CHECK_NULLPTR(tensor, KERNEL_STATUS_PARAM_INVALID, "Get output tensor[%d] failed", i)
outputs_.push_back(tensor);
}
// get param type
param_type_ = static_cast<DataType>(inputs_[0]->GetDataType());
indices_type_ = static_cast<DataType>(inputs_[1]->GetDataType());
KERNEL_LOG_INFO("GetInputAndCheck success!");
std::shared_ptr<TensorShape> cache_table_shape = ctx.Input(0)->GetTensorShape();
std::shared_ptr<TensorShape> indices_shape = ctx.Input(1)->GetTensorShape();
for (int32_t i = 1; i < cache_table_shape->GetDims(); ++i) {
KERNEL_CHECK_ASSIGN_64S_MULTI(one_line_col_, cache_table_shape->GetDimSize(i), one_line_col_,
KERNEL_STATUS_PARAM_INVALID);
}
for (int32_t i = 0; i < indices_shape->GetDims(); ++i) {
KERNEL_CHECK_ASSIGN_64S_MULTI(batch_size_, indices_shape->GetDimSize(i), batch_size_, KERNEL_STATUS_PARAM_INVALID);
}
output_size_ = batch_size_ * one_line_col_;
return KERNEL_STATUS_OK;
}
uint32_t CacheSwapTableMsCpuKernel::Compute(const CpuKernelContext &ctx) {
uint32_t res = GetInputAndCheck(ctx);
if (res != KERNEL_STATUS_OK) {
return res;
}
res = DoCompute();
if (res != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Compute failed");
return res;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kCacheSwapTable, CacheSwapTableMsCpuKernel);
} // namespace aicpu
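
The core of CacheSwapTableTask is a per-row swap: for each batch element with a non-negative swap index, the current cache row is saved into old_value and then overwritten by the matching miss_value row. A standalone sketch of that semantics on plain std::vector buffers (shapes and values here are made up for illustration):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t one_line_col = 3;                 // elements per cache row
  std::vector<float> cache_table = {0, 0, 0,      // row 0
                                    1, 1, 1,      // row 1
                                    2, 2, 2};     // row 2
  std::vector<int32_t> swap_cache_idx = {2, -1};  // batch_size = 2; -1 means "skip"
  std::vector<float> miss_value = {9, 9, 9,
                                   8, 8, 8};
  // old_value is zero-initialized, like the memset_s in the kernel.
  std::vector<float> old_value(swap_cache_idx.size() * one_line_col, 0.0f);

  for (size_t i = 0; i < swap_cache_idx.size(); ++i) {
    const int32_t row = swap_cache_idx[i];
    if (row < 0) continue;                        // negative index: nothing to swap
    for (int64_t c = 0; c < one_line_col; ++c) {
      old_value[i * one_line_col + c] = cache_table[row * one_line_col + c];
      cache_table[row * one_line_col + c] = miss_value[i * one_line_col + c];
    }
  }
  // old_value -> {2 2 2, 0 0 0}; cache_table row 2 -> {9 9 9}
  for (float v : old_value) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}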

View File

@@ -0,0 +1,44 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_CACHE_SWAP_TABLE_H
#define AICPU_KERNELS_NORMALIZED_CACHE_SWAP_TABLE_H
#include <cmath>
#include <vector>
#include "cpu_ops_kernel.h"
namespace aicpu {
class CacheSwapTableMsCpuKernel : public CpuKernel {
public:
~CacheSwapTableMsCpuKernel() = default;
uint32_t Compute(const CpuKernelContext &ctx) override;
private:
uint32_t DoCompute();
uint32_t GetInputAndCheck(const CpuKernelContext &ctx);
int64_t batch_size_ = 1;
int64_t one_line_col_ = 1;
int64_t output_size_ = 1;
std::vector<Tensor *> inputs_;
std::vector<Tensor *> outputs_;
DataType param_type_ = DT_FLOAT;
DataType indices_type_ = DT_INT32;
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,143 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fill.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *const kFill = "Fill";
}
namespace aicpu {
template <typename T>
void FillGenerateCase(Tensor *&value_tensor, Tensor *&output) {
auto value = *(reinterpret_cast<T *>(value_tensor->GetData()));
if (AddrAlignedCheck(output->GetData())) {
Eigen::TensorMap<Eigen::Tensor<T, 1>, Eigen::Aligned> eigen_output(static_cast<T *>(output->GetData()),
output->GetTensorShape()->NumElements());
eigen_output.setConstant(value);
} else {
Eigen::TensorMap<Eigen::Tensor<T, 1>, Eigen::Unaligned> eigen_output(static_cast<T *>(output->GetData()),
output->GetTensorShape()->NumElements());
eigen_output.setConstant(value);
}
}
uint32_t FillCpuKernel::GetDimsByType(const CpuKernelContext &ctx) {
dims.clear();
Tensor *dims_tensor = ctx.Input(0);
KERNEL_CHECK_NULLPTR(dims_tensor, KERNEL_STATUS_PARAM_INVALID, "Get dims input failed")
uint32_t ret;
auto dims_dtype = dims_tensor->GetDataType();
switch (dims_dtype) {
case (DT_INT32):
ret = CalcDims<int32_t>(dims_tensor, dims);
break;
case (DT_INT64):
ret = CalcDims<int64_t>(dims_tensor, dims);
break;
default:
KERNEL_LOG_ERROR(
"Fill kernel dims data_type [%u] not support, support data_types: "
"DT_INT32, DT_INT64",
dims_dtype);
return KERNEL_STATUS_PARAM_INVALID;
}
if (ret != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Fill kernel calculate dims failed");
}
return ret;
}
uint32_t FillCpuKernel::Compute(const CpuKernelContext &ctx) {
uint32_t check = GetDimsByType(ctx);
if (check != KERNEL_STATUS_OK) {
return check;
}
Tensor *value_tensor = ctx.Input(1);
KERNEL_CHECK_NULLPTR(value_tensor, KERNEL_STATUS_PARAM_INVALID, "Get value input failed")
KERNEL_CHECK_NULLPTR(value_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get value input data failed")
KERNEL_CHECK_NULLPTR(value_tensor->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get value input shape failed")
if (!value_tensor->GetTensorShape()->GetDimSizes().empty()) {
KERNEL_LOG_ERROR("Fill kernel value input is not a scalar.");
return KERNEL_STATUS_PARAM_INVALID;
}
Tensor *output = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed")
KERNEL_CHECK_NULLPTR(output->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
KERNEL_CHECK_NULLPTR(output->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get output shape failed")
if (output->GetTensorShape()->GetDimSizes() != dims) {
KERNEL_LOG_ERROR("Fill kernel output shape not matched.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto input_dtype = value_tensor->GetDataType();
auto output_dtype = output->GetDataType();
if (input_dtype != output_dtype) {
KERNEL_LOG_ERROR("Fill kernel data type not matched, value input dtype [%u], output dtype [%u].", input_dtype,
output_dtype);
return KERNEL_STATUS_PARAM_INVALID;
}
std::map<int, std::function<void(Tensor *&, Tensor *&)>> calls;
calls[DT_INT8] = FillGenerateCase<int8_t>;
calls[DT_UINT8] = FillGenerateCase<uint8_t>;
calls[DT_INT16] = FillGenerateCase<int16_t>;
calls[DT_UINT16] = FillGenerateCase<uint16_t>;
calls[DT_INT32] = FillGenerateCase<int32_t>;
calls[DT_UINT32] = FillGenerateCase<uint32_t>;
calls[DT_INT64] = FillGenerateCase<int64_t>;
calls[DT_UINT64] = FillGenerateCase<uint64_t>;
calls[DT_BOOL] = FillGenerateCase<bool>;
calls[DT_FLOAT16] = FillGenerateCase<Eigen::half>;
calls[DT_FLOAT] = FillGenerateCase<float>;
calls[DT_DOUBLE] = FillGenerateCase<double>;
if (calls.find(output_dtype) == calls.end()) {
KERNEL_LOG_ERROR("Fill kernel data type [%u] not support", output_dtype);
return KERNEL_STATUS_PARAM_INVALID;
}
calls[output_dtype](value_tensor, output);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t FillCpuKernel::CalcDims(const Tensor *dims_tensor, std::vector<int64_t> &dim_vec) {
uint64_t data_num = dims_tensor->GetDataSize() / sizeof(T);
if (data_num == 0) {
KERNEL_LOG_INFO("Fill kernel: dims is empty, fill scalar output.");
return KERNEL_STATUS_OK;
}
KERNEL_CHECK_NULLPTR(dims_tensor->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get dims data failed")
for (uint64_t i = 0; i < data_num; i++) {
auto dim = *(reinterpret_cast<const T *>(dims_tensor->GetData()) + i);
if (dim < 0) {
KERNEL_LOG_ERROR("Fill kernel: input dim [%llu] is negative, value=[%lld]", i, static_cast<int64_t>(dim));
return KERNEL_STATUS_PARAM_INVALID;
}
// zero dim is different from empty dim.
if (dim == 0) {
KERNEL_LOG_INFO("Fill kernel: input dim [%llu] is zero", i);
}
dim_vec.emplace_back(dim);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kFill, FillCpuKernel);
} // namespace aicpu
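
Fill's contract is small: input 0 carries the output dims, input 1 a scalar value, and the output of prod(dims) elements is set to that constant (the setConstant call in the Eigen path above). A minimal standard-library sketch of the same computation, with illustrative values:

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  const std::vector<int32_t> dims = {2, 3};   // dims tensor (DT_INT32 path)
  const float value = 1.5f;                   // scalar value tensor
  // Output element count is the product of the requested dims.
  const int64_t num =
      std::accumulate(dims.begin(), dims.end(), int64_t{1},
                      [](int64_t acc, int32_t d) { return acc * d; });
  // Equivalent of eigen_output.setConstant(value).
  std::vector<float> output(static_cast<size_t>(num), value);
  std::cout << output.size() << " elements, all " << output.front() << '\n';
  return 0;
}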

View File

@@ -0,0 +1,43 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_FILL_H
#define AICPU_KERNELS_NORMALIZED_FILL_H
#include "cpu_ops_kernel.h"
namespace aicpu {
class FillCpuKernel : public CpuKernel {
public:
FillCpuKernel() = default;
~FillCpuKernel() override = default;
uint32_t Compute(const CpuKernelContext &ctx) override;
private:
uint32_t GetDimsByType(const CpuKernelContext &ctx);
/**
* @brief calc dims from input dims tensor
* @param dims_tensor input dims tensor
* @param dims output shape dims
* @return status if success
*/
template <typename T>
uint32_t CalcDims(const Tensor *dims_tensor, std::vector<int64_t> &dims);
std::vector<int64_t> dims;
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_FILL_H

View File

@@ -0,0 +1,293 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "masked_select.h"
#include <array>
#include <atomic>
#include <algorithm>
#include <vector>
#include "Eigen/Core"
#include "securec.h"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/broadcast_iterator.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kMaskedSelectInputNum = 2;
constexpr uint32_t kMaskedSelectOutputNum = 1;
constexpr int64_t kParallelDataNums = 32 * 1000;
const char *const kMaskedSelect = "MaskedSelect";
struct OutputInfo {
int64_t startIdx;
int64_t len;
OutputInfo() {
startIdx = 0;
len = 0;
}
};
// std::sort below requires a strict weak ordering, so compare with '<' rather than '<='.
bool CompareFunc(const OutputInfo &a, const OutputInfo &b) { return a.startIdx < b.startIdx; }
// calculate the index stride of dataShape.
// dataShape:[m, 1, k] and broadcastShape:[j, m, n, k] --> index_stride:[0, k, 0, 1]
std::vector<int64_t> CalIndexStride(const std::vector<int64_t> &dataShape, const std::vector<int64_t> &broadcastShape) {
int broadcastDimNum = broadcastShape.size();
int dataDimNum = dataShape.size();
int diffDimNum = broadcastDimNum - dataDimNum;
std::vector<int64_t> indexStride(broadcastDimNum, 0);
indexStride[broadcastDimNum - 1] = 1;
for (int i = broadcastDimNum - 1; i > diffDimNum; i--) {
indexStride[i - 1] = indexStride[i] * dataShape[i];
}
for (int i = 0; i < dataDimNum; i++) {
if (dataShape[i] == 1) {
indexStride[i + diffDimNum] = 0;
}
}
return indexStride;
}
// calculate the index stride of shape.
// shape:[m, n, k] --> index_stride:[n*k, k, 1]
std::vector<int64_t> CalIndexStride(const std::vector<int64_t> &shape) {
int dimNum = shape.size();
std::vector<int64_t> indexStride(dimNum, 1);
for (int i = dimNum - 1; i > 0; i--) {
indexStride[i - 1] = indexStride[i] * shape[i];
}
return indexStride;
}
// calculate the original index of data.
// shape:[7,8,9] indexStride:[72,9,1] and flatten_index:11--> ori_index:[0,1,2]
bool CalIndexInfo(const std::vector<int64_t> &indexStride, int64_t flattenIndex, std::vector<int64_t> &oriIndex,
int dimNum) {
for (int i = 0; i < dimNum - 1; i++) {
if (indexStride[i] == 0) {
return false;
}
oriIndex[i] = flattenIndex / indexStride[i];
flattenIndex = flattenIndex % indexStride[i];
}
oriIndex[dimNum - 1] = flattenIndex;
return true;
}
inline int64_t CalFlattenIndex(const std::vector<int64_t> &indexStride, const std::vector<int64_t> &oriIndex,
int dimNum) {
int64_t flattenIndex = 0;
for (int i = 0; i < dimNum; i++) {
flattenIndex += indexStride[i] * oriIndex[i];
}
return flattenIndex;
}
void UpdateIndexByCarry(std::vector<int64_t> &preIndex, const std::vector<int64_t> &shape, int dimNum) {
// shape:[7,3,10,17] and last index:[0,0,9,16] -> next index:[0,1,0,0]
constexpr int64_t carryBit = 1;
for (int i = dimNum - 1; i >= 0; i--) {
preIndex[i] = preIndex[i] + carryBit;
if (preIndex[i] < shape[i]) {
break;
} else {
preIndex[i] = preIndex[i] - shape[i];
}
}
return;
}
} // namespace
namespace aicpu {
uint32_t MaskedSelectCpuKernel::Compute(const CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaskedSelectInputNum, kMaskedSelectOutputNum), "[%s] check params failed.",
kMaskedSelect);
// choose compute function depend on dataType
auto data_type0 = static_cast<DataType>(ctx.Input(kFirstInputIndex)->GetDataType());
auto data_type1 = static_cast<DataType>(ctx.Input(kSecondInputIndex)->GetDataType());
auto data_type2 = static_cast<DataType>(ctx.Output(kFirstOutputIndex)->GetDataType());
if (data_type1 != DT_BOOL) {
KERNEL_LOG_ERROR("[%s] Data type of mask requires bool, but got data type [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type1).c_str());
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
if (data_type0 != data_type2) {
KERNEL_LOG_ERROR("[%s] Data type of x and y requires same, but got data type [%s] and [%s].",
ctx.GetOpType().c_str(), DTypeStr(data_type0).c_str(), DTypeStr(data_type2).c_str());
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
switch (data_type0) {
case DT_FLOAT16:
return MaskedSelectCompute<Eigen::half>(ctx);
case DT_FLOAT:
return MaskedSelectCompute<float>(ctx);
case DT_DOUBLE:
return MaskedSelectCompute<double>(ctx);
case DT_INT8:
return MaskedSelectCompute<int8_t>(ctx);
case DT_INT16:
return MaskedSelectCompute<int16_t>(ctx);
case DT_INT32:
return MaskedSelectCompute<int32_t>(ctx);
case DT_INT64:
return MaskedSelectCompute<int64_t>(ctx);
case DT_UINT8:
return MaskedSelectCompute<uint8_t>(ctx);
case DT_UINT16:
return MaskedSelectCompute<uint16_t>(ctx);
case DT_UINT32:
return MaskedSelectCompute<uint32_t>(ctx);
case DT_UINT64:
return MaskedSelectCompute<uint64_t>(ctx);
case DT_BOOL:
return MaskedSelectCompute<bool>(ctx);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type0).c_str());
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
}
template <typename T>
uint32_t MaskedSelectCpuKernel::ParallelCompute(const CpuKernelContext &ctx, const std::vector<int64_t> &inputShapeX,
const std::vector<int64_t> &inputShapeMask,
const std::vector<int64_t> &outputShape, int64_t dataNum) {
T *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
bool *mask = reinterpret_cast<bool *>(ctx.Input(1)->GetData());
T *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::atomic<int> threadNum{0};
std::atomic<bool> taskFlag(true);
constexpr int queueLen = 100;
std::array<OutputInfo, queueLen> outputIndexList;
std::vector<int64_t> indexStrideX = CalIndexStride(inputShapeX, outputShape);
std::vector<int64_t> indexStrideMask = CalIndexStride(inputShapeMask, outputShape);
std::vector<int64_t> indexStrideOutput = CalIndexStride(outputShape);
KERNEL_LOG_DEBUG("index stride of x[%s].", VectorToString(indexStrideX).c_str());
KERNEL_LOG_DEBUG("index stride of mask[%s].", VectorToString(indexStrideMask).c_str());
auto work = [=, &threadNum, &taskFlag, &outputIndexList](int64_t start, int64_t end) {
int64_t cnt = 0;
int dimNum = outputShape.size();
std::vector<int64_t> indexValue(dimNum, 0);
if (!CalIndexInfo(indexStrideOutput, start, indexValue, dimNum)) {
taskFlag.store(false);
KERNEL_LOG_ERROR("Invalid index stride, please check.");
return;
}
for (int64_t i = start; i < end; ++i) {
int64_t maskFlatIndex = CalFlattenIndex(indexStrideMask, indexValue, dimNum);
int64_t xFlatIndex = CalFlattenIndex(indexStrideX, indexValue, dimNum);
if (mask[maskFlatIndex]) {
y[start + cnt] = x[xFlatIndex];
cnt++;
}
UpdateIndexByCarry(indexValue, outputShape, dimNum);
}
int idx = threadNum.fetch_add(1, std::memory_order_relaxed);
if (idx >= queueLen) {
taskFlag.store(false);
return;
}
outputIndexList[idx].startIdx = start;
outputIndexList[idx].len = cnt;
KERNEL_LOG_DEBUG("outputIndexList[%d] startIdx is [%lld], len is [%lld].", idx, outputIndexList[idx].startIdx,
outputIndexList[idx].len);
};
constexpr int perUnitSize = 1000;
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, dataNum, perUnitSize, work), "MaskedSelect calculate failed.");
if (!taskFlag.load()) {
KERNEL_LOG_ERROR("Invalid array.");
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
int validNum = threadNum.load();
std::sort(outputIndexList.begin(), outputIndexList.begin() + validNum, CompareFunc);
int validOffset = outputIndexList[0].len;
int64_t copyLen = 0;
int ret = 0;
for (int i = 1; i < validNum; i++) {
copyLen = outputIndexList[i].len;
if (copyLen <= 0) {
continue;
}
int64_t byteLen = copyLen * static_cast<int64_t>(sizeof(T));
ret = memmove_s(y + validOffset, byteLen, y + outputIndexList[i].startIdx, byteLen);
KERNEL_CHECK_FALSE((ret == EOK), KERNEL_STATUS_PARAM_INVALID, "Memmove failed, result = [%d].", ret);
validOffset += copyLen;
}
ctx.Output(0)->GetTensorShape()->SetDimSizes({validOffset});
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
template <typename T>
uint32_t MaskedSelectCpuKernel::MaskedSelectCompute(const CpuKernelContext &ctx) {
T *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
KERNEL_CHECK_NULLPTR(x, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get input_data[0] failed.",
kMaskedSelect);
bool *mask = reinterpret_cast<bool *>(ctx.Input(1)->GetData());
KERNEL_CHECK_NULLPTR(mask, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get input_data[1] failed.",
kMaskedSelect);
T *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
KERNEL_CHECK_NULLPTR(y, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get output_data[0] failed.",
kMaskedSelect);
auto input_shape_a = ctx.Input(0)->GetTensorShape()->GetDimSizes();
auto input_shape_b = ctx.Input(1)->GetTensorShape()->GetDimSizes();
if (IsScalar(input_shape_a) && IsScalar(input_shape_b)) {
if (mask[0]) {
y[0] = x[0];
ctx.Output(0)->GetTensorShape()->SetDimSizes({1});
} else {
ctx.Output(0)->GetTensorShape()->SetDimSizes({0});
}
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
std::vector<int64_t> output_shape;
auto ret = GetBroadcastShape(input_shape_a, input_shape_b, &output_shape);
KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID),
"Shape of x and mask can't be broadcast.");
int64_t tensor_size = 1;
for (const int64_t &d : output_shape) {
tensor_size *= d;
}
if (tensor_size >= kParallelDataNums) {
ret = ParallelCompute<T>(ctx, input_shape_a, input_shape_b, output_shape, tensor_size);
return ret;
}
int64_t j = 0;
BroadcastIterator iter(input_shape_a, input_shape_b, &output_shape);
iter.SetPos(0);
for (int64_t i = 0; i < tensor_size; ++i) {
if (mask[iter.GetInputPosB()]) {
y[j++] = x[iter.GetInputPosA()];
}
iter.GenNextPos();
}
ctx.Output(0)->GetTensorShape()->SetDimSizes({j});
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
REGISTER_CPU_KERNEL(kMaskedSelect, MaskedSelectCpuKernel);
} // namespace aicpu
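
The parallel path above relies on the stride trick described in the CalIndexStride comments: dimensions that are broadcast (size 1 or missing) get a stride of 0, so an output coordinate collapses to a flat source offset via a dot product. A standalone sketch of that mapping, reproducing the dataShape:[m,1,k] / broadcastShape:[j,m,n,k] -> indexStride:[0,k,0,1] example (function and variable names here are illustrative, not the kernel's API):

#include <cstdint>
#include <iostream>
#include <vector>

// Row-major strides of `shape` viewed inside `broadcast`, 0 where the dim is broadcast.
std::vector<int64_t> BroadcastStrides(const std::vector<int64_t> &shape,
                                      const std::vector<int64_t> &broadcast) {
  const int out = static_cast<int>(broadcast.size());
  const int diff = out - static_cast<int>(shape.size());
  std::vector<int64_t> stride(out, 0);
  int64_t s = 1;
  for (int i = out - 1; i >= diff; --i) {
    stride[i] = (shape[i - diff] == 1) ? 0 : s;
    s *= shape[i - diff];
  }
  return stride;
}

int main() {
  const std::vector<int64_t> data_shape = {4, 1, 5};          // [m, 1, k] with m=4, k=5
  const std::vector<int64_t> broadcast_shape = {2, 4, 3, 5};  // [j, m, n, k]
  const auto stride = BroadcastStrides(data_shape, broadcast_shape);
  for (int64_t v : stride) std::cout << v << ' ';             // 0 5 0 1
  std::cout << '\n';
  // Map the broadcast coordinate [1, 2, 0, 3] back to the flat source offset.
  const std::vector<int64_t> coord = {1, 2, 0, 3};
  int64_t flat = 0;
  for (size_t i = 0; i < coord.size(); ++i) flat += coord[i] * stride[i];
  std::cout << "flat source index: " << flat << '\n';         // 2*5 + 3 = 13
  return 0;
}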

View File

@@ -0,0 +1,41 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MASKED_SELECT_H_
#define AICPU_KERNELS_NORMALIZED_MASKED_SELECT_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class MaskedSelectCpuKernel : public CpuKernel {
public:
~MaskedSelectCpuKernel() = default;
uint32_t Compute(const CpuKernelContext &ctx) override;
private:
/**
* @brief compute for all types
* @param ctx cpu kernel context
* @return status if success
*/
template <typename T>
uint32_t MaskedSelectCompute(const CpuKernelContext &ctx);
template <typename T>
uint32_t ParallelCompute(const CpuKernelContext &ctx, const std::vector<int64_t> &inputShapeX,
const std::vector<int64_t> &inputShapeMask, const std::vector<int64_t> &outputShape,
int64_t dataNum);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,121 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "masked_select_grad.h"
#include "Eigen/Core"
#include "securec.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/broadcast_iterator.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kMaskedSelectGradInputNum = 3;
constexpr uint32_t kMaskedSelectGradOutputNum = 1;
const char *const kMaskedSelectGrad = "MaskedSelectGrad";
} // namespace
namespace aicpu {
uint32_t MaskedSelectGradCpuKernel::Compute(const CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaskedSelectGradInputNum, kMaskedSelectGradOutputNum),
"[%s] check params failed.", kMaskedSelectGrad);
// choose compute function depend on dataType
auto data_type0 = static_cast<DataType>(ctx.Input(kFirstInputIndex)->GetDataType());
auto data_type1 = static_cast<DataType>(ctx.Input(kSecondInputIndex)->GetDataType());
auto data_type2 = static_cast<DataType>(ctx.Input(2)->GetDataType());
if (data_type1 != DT_BOOL) {
KERNEL_LOG_ERROR("[%s] Data type of mask requires bool, but got data type [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type1).c_str());
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
if (data_type0 != data_type2) {
KERNEL_LOG_ERROR("[%s] Data type of x and y requires same, but got data type [%s] and [%s].",
ctx.GetOpType().c_str(), DTypeStr(data_type0).c_str(), DTypeStr(data_type2).c_str());
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
switch (data_type0) {
case DT_FLOAT16:
return MaskedSelectGradCompute<Eigen::half>(ctx);
case DT_FLOAT:
return MaskedSelectGradCompute<float>(ctx);
case DT_DOUBLE:
return MaskedSelectGradCompute<double>(ctx);
case DT_INT8:
return MaskedSelectGradCompute<int8_t>(ctx);
case DT_INT16:
return MaskedSelectGradCompute<int16_t>(ctx);
case DT_INT32:
return MaskedSelectGradCompute<int32_t>(ctx);
case DT_INT64:
return MaskedSelectGradCompute<int64_t>(ctx);
case DT_UINT8:
return MaskedSelectGradCompute<uint8_t>(ctx);
case DT_UINT16:
return MaskedSelectGradCompute<uint16_t>(ctx);
case DT_UINT32:
return MaskedSelectGradCompute<uint32_t>(ctx);
case DT_UINT64:
return MaskedSelectGradCompute<uint64_t>(ctx);
case DT_BOOL:
return MaskedSelectGradCompute<bool>(ctx);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type0).c_str());
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
}
template <typename T>
uint32_t MaskedSelectGradCpuKernel::MaskedSelectGradCompute(const CpuKernelContext &ctx) {
bool *mask = reinterpret_cast<bool *>(ctx.Input(1)->GetData());
KERNEL_CHECK_NULLPTR(mask, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get input_data[1] failed.",
kMaskedSelectGrad);
T *grad = reinterpret_cast<T *>(ctx.Input(2)->GetData());
KERNEL_CHECK_NULLPTR(grad, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get input_data[2] failed.",
kMaskedSelectGrad);
T *dx = reinterpret_cast<T *>(ctx.Output(0)->GetData());
KERNEL_CHECK_NULLPTR(dx, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "[%s] get output_data[0] failed.",
kMaskedSelectGrad);
auto input_shape_a = ctx.Input(0)->GetTensorShape()->GetDimSizes();
auto input_shape_b = ctx.Input(1)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_shape;
auto ret = GetBroadcastShape(input_shape_a, input_shape_b, &output_shape);
KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID, "Shape of x and mask can't be broadcast.");
int64_t tensor_size = 1;
for (const int64_t &d : output_shape) {
tensor_size *= d;
}
const T NUM_ZERO = static_cast<T>(0);
for (int k = 0; k < tensor_size; ++k) {
dx[k] = NUM_ZERO;
}
int64_t j = 0;
BroadcastIterator iter(input_shape_a, input_shape_b, &output_shape);
iter.SetPos(0);
for (int64_t i = 0; i < tensor_size; ++i) {
if (mask[iter.GetInputPosB()]) {
dx[iter.GetInputPosA()] += grad[j++];
}
iter.GenNextPos();
}
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
REGISTER_CPU_KERNEL(kMaskedSelectGrad, MaskedSelectGradCpuKernel);
} // namespace aicpu
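
MaskedSelectGradCompute is essentially a scatter-add: dx is zero-filled and the packed grad values are added back at the positions where mask is true. A tiny standalone 1-D sketch of that step, without the broadcasting handled above:

#include <iostream>
#include <vector>

int main() {
  const std::vector<bool> mask = {true, false, true, true};
  const std::vector<float> grad = {10.f, 20.f, 30.f};  // one value per true mask entry
  std::vector<float> dx(mask.size(), 0.0f);            // gradient w.r.t. x, zero-filled
  size_t j = 0;
  for (size_t i = 0; i < mask.size(); ++i) {
    if (mask[i]) dx[i] += grad[j++];                   // scatter grad back to masked slots
  }
  for (float v : dx) std::cout << v << ' ';            // 10 0 20 30
  std::cout << '\n';
  return 0;
}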

View File

@@ -1,5 +1,5 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,16 +13,25 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MASKED_SELECT_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MASKED_SELECT_GRAD_H_
#ifndef AICPU_KERNELS_NORMALIZED_ACOS_H
#define AICPU_KERNELS_NORMALIZED_ACOS_H
#include "cpu_kernel/inc/cpu_ops_kernel.h"
#include "cpu_ops_kernel.h"
namespace aicpu {
class AcosCpuKernel final : public CpuKernel {
class MaskedSelectGradCpuKernel : public CpuKernel {
public:
std::uint32_t Compute(const CpuKernelContext &ctx) override;
~MaskedSelectGradCpuKernel() = default;
uint32_t Compute(const CpuKernelContext &ctx) override;
private:
/**
* @brief compute for all types
* @param ctx cpu kernel context
* @return status if success
*/
template <typename T>
uint32_t MaskedSelectGradCompute(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,137 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nms_with_mask.h"
#include <numeric>
#include "Eigen/Core"
#include "utils/kernel_util.h"
namespace {
const int32_t kInputNum = 1;
const int32_t kOutputNum = 3;
const int kColNum5 = 5;
const int kColNum8 = 8;
const char *kNMSWithMask = "NMSWithMask";
} // namespace
namespace aicpu {
uint32_t NMSWithMaskCpuKernel::Compute(const CpuKernelContext &ctx) {
// check param
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "NMSWithMask check input or output is failed");
AttrValue *iou_threshold = ctx.GetAttr("iou_threshold");
KERNEL_CHECK_FALSE((iou_threshold != nullptr), KERNEL_STATUS_PARAM_INVALID, "Get attr [iou_threshold] failed.");
iou_value_ = iou_threshold->GetFloat();
Tensor *input_data = ctx.Input(0);
auto data_type = input_data->GetDataType();
KERNEL_CHECK_FALSE((data_type == DT_FLOAT || data_type == DT_FLOAT16), KERNEL_STATUS_PARAM_INVALID,
"Input[0] data type[%s] is unsupported", DTypeStr(data_type).c_str());
auto input_shape = input_data->GetTensorShape()->GetDimSizes();
num_input_ = input_shape[0]; // Get N values in [N, 5] data.
box_size_ = input_shape[1];
if (box_size_ != kColNum5 && box_size_ != kColNum8) {
KERNEL_LOG_INFO("NMSWithMask the col number of input[0] must be [%d] or [%d], but got [%d]!", kColNum5, kColNum8,
box_size_);
return KERNEL_STATUS_PARAM_INVALID;
}
uint32_t res;
switch (data_type) {
case DT_FLOAT16:
res = DoCompute<Eigen::half>(ctx);
break;
case DT_FLOAT:
res = DoCompute<float>(ctx);
break;
default:
KERNEL_LOG_INFO("NMSWithMask input[0] only support type[DT_FLOAT16, DT_FLOAT], but got type[%s]",
DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
break;
}
return res;
}
template <typename T>
uint32_t NMSWithMaskCpuKernel::DoCompute(const CpuKernelContext &ctx) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(OUTPUT)->GetData());
auto sel_idx = reinterpret_cast<int *>(ctx.Output(SEL_IDX)->GetData());
auto sel_boxes = reinterpret_cast<bool *>(ctx.Output(SEL_BOXES)->GetData());
std::fill(&sel_idx[0], &sel_idx[num_input_], 0);
std::fill(&sel_boxes[0], &sel_boxes[num_input_], false);
const int box_size = box_size_;
const auto comp = [input, box_size](const size_t a, const size_t b) {
const size_t index_a = a * box_size + 4;
const size_t index_b = b * box_size + 4;
if (input[index_b] == input[index_a]) {
return a < b;
};
return input[index_b] < input[index_a];
};
std::vector<int> order(num_input_);
std::iota(order.begin(), order.end(), 0);
std::sort(order.begin(), order.end(), comp);
std::vector<T> areas(num_input_);
for (int64_t i = 0; i < num_input_; i++) {
areas[i] =
(input[i * box_size_ + 2] - input[i * box_size_]) * (input[i * box_size_ + 3] - input[i * box_size_ + 1]);
}
int64_t num_to_keep = 0;
for (int64_t _i = 0; _i < num_input_; _i++) {
auto i = order[_i];
if (sel_boxes[i] == 1) continue;
sel_idx[num_to_keep++] = i;
auto ix1 = input[i * box_size_];
auto iy1 = input[i * box_size_ + 1];
auto ix2 = input[i * box_size_ + 2];
auto iy2 = input[i * box_size_ + 3];
for (int64_t _j = _i + 1; _j < num_input_; _j++) {
auto j = order[_j];
if (sel_boxes[j] == 1) continue;
auto xx1 = std::max(ix1, input[j * box_size_]);
auto yy1 = std::max(iy1, input[j * box_size_ + 1]);
auto xx2 = std::min(ix2, input[j * box_size_ + 2]);
auto yy2 = std::min(iy2, input[j * box_size_ + 3]);
auto w = std::max(static_cast<T>(0), xx2 - xx1);
auto h = std::max(static_cast<T>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (areas[i] + areas[j] - inter);
if (static_cast<float>(ovr) > iou_value_) {
sel_boxes[j] = 1;
}
}
}
for (int k = 0; k < num_input_; ++k) {
for (int j = 0; j < box_size_; ++j) {
if (k < num_to_keep) {
output[k * kColNum5 + j] = input[sel_idx[k] * box_size_ + j];
sel_boxes[k] = true;
} else {
output[k * kColNum5 + j] = static_cast<T>(0);
sel_boxes[k] = false;
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kNMSWithMask, NMSWithMaskCpuKernel);
} // namespace aicpu
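
The inner loop above suppresses a box when its overlap ratio with an already-kept box exceeds iou_threshold. A standalone sketch of that single IoU test on two [x1, y1, x2, y2] boxes (the numbers are made up):

#include <algorithm>
#include <iostream>

int main() {
  const float a[4] = {0.f, 0.f, 4.f, 4.f};
  const float b[4] = {2.f, 2.f, 6.f, 6.f};
  const float iou_threshold = 0.2f;

  const float area_a = (a[2] - a[0]) * (a[3] - a[1]);                          // 16
  const float area_b = (b[2] - b[0]) * (b[3] - b[1]);                          // 16
  const float w = std::max(0.f, std::min(a[2], b[2]) - std::max(a[0], b[0]));  // 2
  const float h = std::max(0.f, std::min(a[3], b[3]) - std::max(a[1], b[1]));  // 2
  const float inter = w * h;                                                   // 4
  const float iou = inter / (area_a + area_b - inter);                         // 4 / 28 ~= 0.143
  std::cout << "iou = " << iou
            << (iou > iou_threshold ? " -> suppress\n" : " -> keep\n");
  return 0;
}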

View File

@@ -0,0 +1,48 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_NMS_WITH_MASK_H
#define AICPU_KERNELS_NORMALIZED_NMS_WITH_MASK_H
#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
namespace aicpu {
class NMSWithMaskCpuKernel : public CpuKernel {
public:
NMSWithMaskCpuKernel() = default;
~NMSWithMaskCpuKernel() override = default;
uint32_t Compute(const CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(const CpuKernelContext &ctx);
int num_input_{0};
float iou_value_{0.0};
size_t ceil_power_2{0};
int box_size_ = 5; // pre_defined box width
enum output_list_ { OUTPUT, SEL_IDX, SEL_BOXES };
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,265 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "reduce_sum.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kReduceSumInputNum = 2;
const uint32_t kReduceSumOutputNum = 1;
const char *const kReduceSum = "ReduceSum";
#define REDUCESUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = ReduceSumCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("ReduceSum kernel compute failed."); \
return result; \
} \
break; \
}
#define REDUCESUM_COMPUTE_CASE_COMPLEX(DTYPE, TYPE, IN_TYPE, CTX) \
case (DTYPE): { \
uint32_t result = ReduceSumCompute2<TYPE, IN_TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("ReduceSum kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t ReduceSumCpuKernel::Compute(const CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kReduceSumInputNum, kReduceSumOutputNum), "[%s] check input and output failed.",
kReduceSum);
KERNEL_HANDLE_ERROR(ReduceSumCheck(ctx), "[%s] check params failed.", kReduceSum);
auto input_data_type = ctx.Input(0)->GetDataType();
switch (input_data_type) {
REDUCESUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
REDUCESUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
REDUCESUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
REDUCESUM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
REDUCESUM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
REDUCESUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
REDUCESUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
REDUCESUM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
REDUCESUM_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
REDUCESUM_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
REDUCESUM_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
REDUCESUM_COMPUTE_CASE_COMPLEX(DT_COMPLEX64, std::complex<float>, float, ctx)
REDUCESUM_COMPUTE_CASE_COMPLEX(DT_COMPLEX128, std::complex<double>, double, ctx)
default:
KERNEL_LOG_ERROR("ReduceSum kernel data type [%s] not support.", DTypeStr(input_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t ReduceSumCpuKernel::ReduceSumCheck(const CpuKernelContext &ctx) const {
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "get input failed.");
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.");
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "get output failed.");
if (ctx.Input(1)->GetData() != nullptr) {
KERNEL_CHECK_FALSE((ctx.Input(1)->GetDataType() == DT_INT32 || ctx.Input(1)->GetDataType() == DT_INT64),
KERNEL_STATUS_PARAM_INVALID, "Data type of axis is not support, axis data type is [%u].",
ctx.Input(1)->GetDataType());
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t ReduceSumCpuKernel::ReduceSumCompute(const CpuKernelContext &ctx) {
std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
auto input_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
if (input_shape.size() == 0) {
output_data[0] = input_data[0];
return KERNEL_STATUS_OK;
}
auto axes_data = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
if (axes_data == nullptr) {
int64_t data_num = ctx.Input(0)->NumElements();
auto accumulator = static_cast<T>(0);
for (int64_t i = 0; i < data_num; i++) {
accumulator += input_data[i];
}
output_data[0] = accumulator;
return KERNEL_STATUS_OK;
}
std::vector<int64_t> axes;
KERNEL_HANDLE_ERROR(ReduceSumDedupAxes(ctx, axes), "ReduceSum deduplicate failed.");
int64_t output_num = ctx.Output(0)->NumElements();
uint32_t axes_idx = 0;
KERNEL_HANDLE_ERROR(ReduceSumOneAxes<T>(input_data, input_shape, output_data, output_num, axes, axes_idx),
"Reduce sum compute failed.");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t ReduceSumCpuKernel::ReduceSumOneAxes(const T *input_data, std::vector<int64_t> &input_shape, T *output_data,
int64_t output_num, std::vector<int64_t> &axes, uint32_t &axes_idx) {
if (axes_idx >= axes.size()) {
for (int64_t i = 0; i < output_num; i++) {
output_data[i] = input_data[i];
}
return KERNEL_STATUS_OK;
}
int64_t inner = 1, outer = 1, depth = 1;
KERNEL_HANDLE_ERROR(ReduceSumParseAxes(input_shape, axes, axes_idx, inner, outer, depth), "parse axes failed.");
auto output_data_temp = new (std::nothrow) T[inner * outer];
KERNEL_CHECK_NULLPTR(output_data_temp, KERNEL_STATUS_INNER_ERROR, "apply memory failed.");
for (int64_t outer_index = 0; outer_index < outer; ++outer_index) {
for (int64_t inner_index = 0; inner_index < inner; inner_index++) {
auto accumulator = static_cast<T>(0);
for (int64_t depth_index = 0; depth_index < depth; depth_index++) {
int64_t index = outer_index;
index += depth_index * outer;
index += inner_index * depth * outer;
accumulator += input_data[index];
}
int64_t output_index = outer_index;
output_index += inner_index * outer;
output_data_temp[output_index] = accumulator;
}
}
uint32_t result = ReduceSumOneAxes<T>(output_data_temp, input_shape, output_data, output_num, axes, axes_idx);
if (output_data_temp != nullptr) {
delete[] output_data_temp;
}
return result;
}
template <typename T, typename T2>
uint32_t ReduceSumCpuKernel::ReduceSumCompute2(const CpuKernelContext &ctx) {
std::vector<int64_t> input_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
auto input_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
if (input_shape.size() == 0) {
output_data[0] = std::complex<T2>(input_data[0].real(), input_data[0].imag());
return KERNEL_STATUS_OK;
}
auto axes_data = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
int64_t input_num = ctx.Input(0)->NumElements();
if (axes_data == nullptr) {
auto accumulator_real = static_cast<T2>(0);
auto accumulator_imag = static_cast<T2>(0);
for (int64_t i = 0; i < input_num; i++) {
accumulator_real += input_data[i].real();
accumulator_imag += input_data[i].imag();
}
output_data[0] = std::complex<T2>(accumulator_real, accumulator_imag);
return KERNEL_STATUS_OK;
}
std::vector<int64_t> axes;
KERNEL_HANDLE_ERROR(ReduceSumDedupAxes(ctx, axes), "ReduceSum deduplicate failed.");
int64_t output_num = ctx.Output(0)->NumElements();
uint32_t axes_idx = 0;
KERNEL_HANDLE_ERROR(
(ReduceSumOneAxes2<T, T2>(input_data, input_num, input_shape, output_data, output_num, axes, axes_idx)),
"Reduce sum compute failed.");
return KERNEL_STATUS_OK;
}
template <typename T, typename T2>
uint32_t ReduceSumCpuKernel::ReduceSumOneAxes2(const T *input_data, int64_t input_num, std::vector<int64_t> input_shape,
T *output_data, int64_t output_num, std::vector<int64_t> &axes,
uint32_t &axes_idx) {
if (axes_idx >= axes.size()) {
auto accumulator_real = static_cast<T2>(0);
auto accumulator_imag = static_cast<T2>(0);
for (int64_t i = 0; i < output_num; i++) {
accumulator_real = input_data[i].real();
accumulator_imag = input_data[i].imag();
output_data[i] = std::complex<T2>(accumulator_real, accumulator_imag);
}
return KERNEL_STATUS_OK;
}
int64_t inner = 1, outer = 1, depth = 1;
KERNEL_HANDLE_ERROR(ReduceSumParseAxes(input_shape, axes, axes_idx, inner, outer, depth), "parse axes failed.");
std::vector<T2> input_data_real(input_num);
std::vector<T2> input_data_imag(input_num);
for (int64_t i = 0; i < input_num; i++) {
input_data_real[i] = input_data[i].real();
input_data_imag[i] = input_data[i].imag();
}
int64_t output_num_temp = inner * outer;
auto *output_data_temp = new (std::nothrow) T[output_num_temp];
KERNEL_CHECK_NULLPTR(output_data_temp, KERNEL_STATUS_INNER_ERROR, "apply memory failed.");
for (int64_t outer_index = 0; outer_index < outer; outer_index++) {
for (int64_t inner_index = 0; inner_index < inner; inner_index++) {
auto accumulator_real = static_cast<T2>(0);
auto accumulator_imag = static_cast<T2>(0);
for (int64_t depth_index = 0; depth_index < depth; depth_index++) {
int64_t index = outer_index;
index += inner_index * depth * outer;
index += depth_index * outer;
accumulator_real += input_data_real[index];
accumulator_imag += input_data_imag[index];
}
int64_t output_index = outer_index;
output_index += inner_index * outer;
output_data_temp[output_index] = std::complex<T2>(accumulator_real, accumulator_imag);
}
}
uint32_t result =
ReduceSumOneAxes2<T, T2>(output_data_temp, output_num_temp, input_shape, output_data, output_num, axes, axes_idx);
if (output_data_temp != nullptr) {
delete[] output_data_temp;
}
return result;
}
uint32_t ReduceSumCpuKernel::ReduceSumDedupAxes(const CpuKernelContext &ctx, std::vector<int64_t> &axes) {
int32_t rank = ctx.Input(0)->GetTensorShape()->GetDims();
auto axes_data = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
int64_t axes_num = ctx.Input(1)->NumElements();
for (int64_t i = 0; i < axes_num; i++) {
int32_t axis = axes_data[i];
KERNEL_CHECK_FALSE((axis < rank) && (axis >= -rank), KERNEL_STATUS_PARAM_INVALID,
"axes[%d] is out of input dims rank[%d]", axis, rank);
if (axis < 0) {
axis += rank;
}
axes.push_back(axis);
}
int64_t j = 1;
while (j < axes_num) {
std::vector<int64_t>::iterator iter = find(axes.begin(), axes.begin() + j, axes[j]);
if (iter != axes.begin() + j) {
axes.erase(iter);
axes_num--;
} else {
j++;
}
}
return KERNEL_STATUS_OK;
}
uint32_t ReduceSumCpuKernel::ReduceSumParseAxes(std::vector<int64_t> &input_shape, std::vector<int64_t> &axes,
uint32_t &axes_idx, int64_t &inner, int64_t &outer,
int64_t &depth) const {
int64_t axis = axes[axes_idx];
axes_idx++;
int64_t rank = input_shape.size();
for (int64_t i = 0; i < rank; i++) {
if (i < axis) {
inner *= input_shape[i];
} else if (i > axis) {
outer *= input_shape[i];
} else {
depth = input_shape[i];
input_shape[i] = 1;
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kReduceSum, ReduceSumCpuKernel);
} // namespace aicpu
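
ReduceSumOneAxes reduces one axis at a time by splitting the row-major layout into inner (dims before the axis), depth (the reduced axis itself) and outer (dims after it), so an element address is inner_index * depth * outer + depth_index * outer + outer_index. A standalone sketch of that index arithmetic, summing axis 1 of a [2, 3, 4] tensor filled with 0..23:

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  const std::vector<int64_t> shape = {2, 3, 4};
  const int axis = 1;
  std::vector<int> data(2 * 3 * 4);
  std::iota(data.begin(), data.end(), 0);

  int64_t inner = 1, outer = 1;
  for (int i = 0; i < axis; ++i) inner *= shape[i];                     // dims before the axis
  for (size_t i = axis + 1; i < shape.size(); ++i) outer *= shape[i];   // dims after the axis
  const int64_t depth = shape[axis];                                    // the reduced axis

  std::vector<int> out(inner * outer, 0);
  for (int64_t in = 0; in < inner; ++in) {
    for (int64_t ou = 0; ou < outer; ++ou) {
      int acc = 0;
      for (int64_t d = 0; d < depth; ++d) {
        acc += data[in * depth * outer + d * outer + ou];               // same address formula
      }
      out[in * outer + ou] = acc;
    }
  }
  for (int v : out) std::cout << v << ' ';  // 12 15 18 21 48 51 54 57
  std::cout << '\n';
  return 0;
}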

View File

@@ -0,0 +1,53 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_REDUCE_SUM_H
#define AICPU_KERNELS_NORMALIZED_REDUCE_SUM_H
#include "cpu_ops_kernel.h"
namespace aicpu {
class ReduceSumCpuKernel : public CpuKernel {
public:
ReduceSumCpuKernel() = default;
~ReduceSumCpuKernel() override = default;
uint32_t Compute(const CpuKernelContext &ctx) override;
private:
uint32_t ReduceSumCheck(const CpuKernelContext &ctx) const;
template <typename T>
uint32_t ReduceSumCompute(const CpuKernelContext &ctx);
template <typename T>
uint32_t ReduceSumOneAxes(const T *input_data, std::vector<int64_t> &input_shape, T *output_data, int64_t output_num,
std::vector<int64_t> &axes, uint32_t &axes_idx);
template <typename T, typename T2>
uint32_t ReduceSumCompute2(const CpuKernelContext &ctx);
template <typename T, typename T2>
uint32_t ReduceSumOneAxes2(const T *input_data, int64_t input_num, std::vector<int64_t> input_shape, T *output_data,
int64_t output_num, std::vector<int64_t> &axes, uint32_t &axes_idx);
uint32_t ReduceSumDedupAxes(const CpuKernelContext &ctx, std::vector<int64_t> &axes);
uint32_t ReduceSumParseAxes(std::vector<int64_t> &input_shape, std::vector<int64_t> &axes, uint32_t &axes_idx,
int64_t &inner, int64_t &outer, int64_t &depth) const;
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,60 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_group.h"
namespace aicpu {
void GroupIterable::IteratorStep::UpdateEndOfGroup() {
++next_loc_;
const auto &ix_t = iter_->ix_matrix_;
const int64_t N = ix_t.dimension(0);
while (next_loc_ < N && iter_->GroupMatches(ix_t, loc_, next_loc_)) {
++next_loc_;
}
}
bool GroupIterable::IteratorStep::operator!=(const IteratorStep &rhs) const { return (rhs.loc_ != loc_); }
bool GroupIterable::IteratorStep::operator==(const IteratorStep &rhs) const { return (rhs.loc_ == loc_); }
GroupIterable::IteratorStep &GroupIterable::IteratorStep::operator++() { // prefix ++
loc_ = next_loc_;
UpdateEndOfGroup();
return *this;
}
const GroupIterable::IteratorStep GroupIterable::IteratorStep::operator++(int) {  // postfix ++
IteratorStep lhs(*this);
++(*this);
return lhs;
}
Group GroupIterable::IteratorStep::operator*() const { return Group(iter_, loc_, next_loc_); }
std::vector<int64_t> Group::group() const {
std::vector<int64_t> g;
const auto &ix_t = iter_->ix_matrix_;
for (const int64_t d : iter_->group_dims_) {
g.push_back(ix_t(loc_, d));
}
return g;
}
TTypes<int64_t>::UnalignedConstMatrix Group::indices() const {
return TTypes<int64_t>::UnalignedConstMatrix(&(iter_->ix_matrix_(loc_, 0)), next_loc_ - loc_, iter_->dims_);
}
} // namespace aicpu

View File

@ -0,0 +1,154 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CPU_KERNEL_UTIL_SPARSE_GROUP_ITERATOR_H_
#define CPU_KERNEL_UTIL_SPARSE_GROUP_ITERATOR_H_
#include <vector>
#include "eigen_tensor.h"
namespace aicpu {
class Group; // Predeclare Group for GroupIterable.
// ///////////////
// GroupIterable
// ///////////////
//
// Returned when calling sparse_tensor.group({dim0, dim1, ...}).
//
// Please note: the sparse_tensor should already be ordered according
// to {dim0, dim1, ...}. Otherwise this iteration will return invalid groups.
//
// Allows grouping and iteration of the SparseTensor according to the
// subset of dimensions provided to the group call.
//
// The actual grouping dimensions are stored in the
// internal vector group_dims_. Iterators inside the iterable provide
// the three methods:
//
// * group(): returns a vector with the current group dimension values.
// * indices(): a matrix view of the indices in
//     this group.
// * values(): a vector view of the values in
//     this group.
//
// To iterate across GroupIterable, see the usage sketch below.
//
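// Usage sketch (illustrative only; assumes a SparseTensor `sp` that is already
// ordered by dimension 0 and holds float values):
//
//   for (const auto &g : sp.group({0})) {
//     std::vector<int64_t> key = g.group();  // values of the grouped dims
//     auto idx = g.indices();                // indices belonging to this group
//     auto vals = g.values<float>();         // values belonging to this group
//     // ... process one group ...
//   }
//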
class GroupIterable {
public:
using VarDimArray = std::vector<int64_t>;
GroupIterable(Tensor *ix, Tensor *vals, int dims, const VarDimArray &group_dims)
: ix_(ix),
ix_matrix_(EigenTensor(ix, ix->GetData()).matrix<int64_t>()),
vals_(vals),
dims_(dims),
group_dims_(group_dims.begin(), group_dims.end()) {}
~GroupIterable() {}
class IteratorStep;
IteratorStep begin() { return IteratorStep(this, 0); }
IteratorStep at(int64_t loc) {
if (!(loc >= 0 && loc <= static_cast<int64_t>(ix_->GetTensorShape()->GetDimSize(0)))) {
KERNEL_LOG_WARN("loc should in [0, %d], but got: %d", ix_->GetTensorShape()->GetDimSize(0), loc);
}
return IteratorStep(this, loc);
}
IteratorStep end() { return IteratorStep(this, ix_->GetTensorShape()->GetDimSize(0)); }
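// Returns true when rows loc_a and loc_b of the index matrix agree on every
// grouping dimension, i.e. both rows belong to the same group.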
template <typename TIX>
inline bool GroupMatches(const TIX &ix, int64_t loc_a, int64_t loc_b) const {
for (int64_t d : group_dims_) {
if (ix(loc_a, d) != ix(loc_b, d)) {
return false;
}
}
return true;
}
class IteratorStep {
public:
IteratorStep(GroupIterable *iter, int64_t loc) : iter_(iter), loc_(loc), next_loc_(loc_) { UpdateEndOfGroup(); }
~IteratorStep() { iter_ = nullptr; }
void UpdateEndOfGroup();
bool operator!=(const IteratorStep &rhs) const;
bool operator==(const IteratorStep &rhs) const;
IteratorStep &operator++();
const IteratorStep operator++(int);
Group operator*() const;
int64_t loc() const { return loc_; }
private:
GroupIterable *iter_;
int64_t loc_;
int64_t next_loc_;
};
private:
friend class Group;
Tensor *ix_;
TTypes<int64_t>::Matrix ix_matrix_;
Tensor *vals_;
const int dims_;
const std::vector<int64_t> group_dims_;
};
// This class is returned when dereferencing a GroupIterable iterator.
// It provides the methods group(), indices(), and values(), which
// provide access into the underlying SparseTensor.
class Group {
public:
Group(GroupIterable *iter, int64_t loc, int64_t next_loc) : iter_(iter), loc_(loc), next_loc_(next_loc) {}
~Group() { iter_ = nullptr; }
std::vector<int64_t> group() const;
TTypes<int64_t>::UnalignedConstMatrix indices() const;
int64_t group_at(size_t index) const {
const auto &ix_t = iter_->ix_matrix_;
return ix_t(loc_, index);
}
template <typename T>
typename TTypes<T>::UnalignedVec values() const {
return typename TTypes<T>::UnalignedVec(&(EigenTensor(iter_->vals_, iter_->vals_->GetData()).vec<T>()(loc_)),
next_loc_ - loc_);
}
private:
GroupIterable *iter_;
int64_t loc_;
int64_t next_loc_;
};
} // namespace aicpu
#endif // CPU_KERNEL_UTIL_SPARSE_GROUP_ITERATOR_H_

View File

@ -0,0 +1,128 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_tensor.h"
#include "cpu_types.h"
namespace aicpu {
uint32_t SparseTensor::CreateSparseTensor(Tensor *ix, Tensor *tensorvals, std::vector<int64_t> shape,
std::vector<int64_t> order) {
KERNEL_LOG_INFO("Start to execute CreateSparseTensor.");
if (ix == nullptr || ix->GetData() == nullptr) {
KERNEL_LOG_ERROR("Ix is nullptr.");
return KERNEL_STATUS_INNER_ERROR;
}
if (tensorvals == nullptr || tensorvals->GetData() == nullptr) {
KERNEL_LOG_ERROR("Vals is nullptr.");
return KERNEL_STATUS_INNER_ERROR;
}
if (ix->GetTensorShape()->GetDims() > 2) {
KERNEL_LOG_ERROR("Index tensor dim size less than 2 or equal to 2, got size [%d] ",
ix->GetTensorShape()->GetDims());
return KERNEL_STATUS_INNER_ERROR;
}
int64_t dims = (ix->GetTensorShape()->GetDims() == 0) ? 1 : ix->GetTensorShape()->GetDimSize(0);
int64_t vals_dim0 = (tensorvals->GetTensorShape()->GetDims() == 0) ? 1 : tensorvals->GetTensorShape()->GetDimSize(0);
if (dims != vals_dim0) {
KERNEL_LOG_ERROR("Ix dim_size_0 [%ld] != tensorvals dim_size_0 [%ld]", dims, vals_dim0);
return KERNEL_STATUS_INNER_ERROR;
}
dims = ix->GetTensorShape()->GetDims() == 2 ? ix->GetTensorShape()->GetDimSize(1) : 1;
int64_t orderSize = static_cast<int64_t>(order.size());
int64_t shapeSize = static_cast<int64_t>(shape.size());
if (orderSize != dims) {
KERNEL_LOG_ERROR("orderSize [%ld] != dims [%ld]", orderSize, dims);
return KERNEL_STATUS_INNER_ERROR;
}
if (shapeSize != dims) {
KERNEL_LOG_ERROR("shapeSize [%ld] != dims [%ld]", shapeSize, dims);
return KERNEL_STATUS_INNER_ERROR;
}
ix_ = std::make_shared<EigenTensor>(ix, ix->GetData());
vals_ = std::make_shared<EigenTensor>(tensorvals, tensorvals->GetData());
if (ix_ == nullptr || vals_ == nullptr) {
KERNEL_LOG_ERROR("Indices or values create eigen tensor failed.");
return KERNEL_STATUS_INNER_ERROR;
}
shape_.assign(shape.begin(), shape.end());
order_.assign(order.begin(), order.end());
dims_ = static_cast<int32_t>(dims);
KERNEL_LOG_INFO("Execute CreateSparseTensor end");
return KERNEL_STATUS_OK;
}
uint32_t SparseTensor::IndicesValid(CpuKernelContext &ctx) const {
if (std::any_of(order_.begin(), order_.end(), [](int64_t ord) { return ord < 0; })) {
KERNEL_LOG_ERROR("Order was not provided.");
return KERNEL_STATUS_INNER_ERROR;
}
if (ix_->GetTensor()->GetDataType() == DT_INT32) {
if (EigenTensorIndicesValid<int32_t>(ctx) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Indices valid failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
} else {
if (EigenTensorIndicesValid<int64_t>(ctx) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Indices valid failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
bool SparseTensor::ValidateToDense(const Tensor *out) const {
KERNEL_LOG_INFO("Start execute ValidateToDense.");
if (out->GetDataType() != vals_->GetTensor()->GetDataType()) {
KERNEL_LOG_ERROR("Output data type must match vals, got out [%d], vals [%d].", out->GetDataType(),
vals_->GetTensor()->GetDataType());
return false;
}
if (out->GetTensorShape()->GetDims() != dims_) {
KERNEL_LOG_ERROR("Output dims must match idx, got output dims [%d], idx dims [%d].",
out->GetTensorShape()->GetDims(), dims_);
return false;
}
const auto out_shape = out->GetTensorShape();
int32_t shapeSize = static_cast<int32_t>(shape_.size());
if (shapeSize != out_shape->GetDims()) {
KERNEL_LOG_ERROR("output dims must match shape dims, got output dim [%d], shape dim [%d].", out_shape->GetDims(),
shapeSize);
return false;
}
for (size_t d = 0; d < shape_.size(); ++d) {
if (shape_[d] > out_shape->GetDimSize(static_cast<int32_t>(d))) {
KERNEL_LOG_ERROR(
"Valid output shape dims value failed, index [%zu], shape value [%ld], "
"greater than output shape value [%d].",
d, shape_[d], out_shape->GetDimSize(static_cast<int32_t>(d)));
return false;
}
}
KERNEL_LOG_INFO("Execute Validate dense end.");
return true;
}
GroupIterable SparseTensor::group(const std::vector<int64_t> &group_ix) const {
if (group_ix.size() > static_cast<size_t>(dims_)) {
KERNEL_LOG_WARN("Grop_ix.size:%zu > dims_:%d", group_ix.size(), dims_);
}
return GroupIterable(const_cast<Tensor *>(ix_->GetTensor()), const_cast<Tensor *>(vals_->GetTensor()), dims_,
group_ix);
}
} // namespace aicpu

View File

@ -0,0 +1,296 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_SPARSETENSOR_H
#define AICPU_SPARSETENSOR_H
#include <algorithm>
#include <memory>
#include "cpu_tensor.h"
#include "eigen_tensor.h"
#include "utils/kernel_util.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "sparse_group.h"
#include "status.h"
namespace aicpu {
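// Force a copy of x through a volatile pointer so the compiler cannot assume the
// value stays stable between reads (the underlying buffer may be user supplied).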
template <typename T>
const T SubtleMustCopy(const T &x) {
auto *to_x = reinterpret_cast<const volatile T *>(&x);
return *to_x;
}
} // namespace aicpu
namespace aicpu {
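// Typical usage (sketch only; error handling abbreviated, and `indices`, `values`,
// `dense_shape`, `order`, `ctx` and `output` are assumed to come from the caller):
//
//   SparseTensor st;
//   if (st.CreateSparseTensor(indices, values, dense_shape, order) != KERNEL_STATUS_OK) {
//     return KERNEL_STATUS_INNER_ERROR;
//   }
//   if (st.IndicesValid(ctx) != KERNEL_STATUS_OK) {
//     return KERNEL_STATUS_PARAM_INVALID;
//   }
//   return st.ToDense<int64_t, float>(ctx, output);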
class SparseTensor {
public:
SparseTensor() : dims_(0) {}
~SparseTensor() = default;
/*
* create sparse tensor
* @param ix: indices tensor
* @param tensorvals: values tensor
* @param shape: dense shape vec
* @param order: dimension order vec for the indices
* @return uint32_t: 0->success other->failed
*/
uint32_t CreateSparseTensor(Tensor *ix, Tensor *tensorvals, std::vector<int64_t> shape, std::vector<int64_t> order);
/*
* sparse indices valid
* @return uint32_t: 0->success other->failed
*/
uint32_t IndicesValid(CpuKernelContext &ctx) const;
/*
* group sparse tensor
* @return GroupIterable
*/
GroupIterable group(const std::vector<int64_t> &group_ix) const;
/*
* serially check that each index is in bounds, in increasing order according to order_, and unique
* @return uint32_t: 0->success other->failed
*/
template <typename T>
uint32_t EigenTensorIndicesValidCheck(int64_t dims_size) const {
const auto ix_t = ix_->matrix<T>();
for (int64_t n = 1; n < dims_size; ++n) {
bool valid = true;
bool different = false;
bool increasing = true;
for (int32_t di = 0; di < dims_; ++di) {
if (ix_t(n, di) < 0 || ix_t(n, di) >= shape_[di]) {
valid = false;
}
int64_t diff = ix_t(n, order_[di]) - ix_t(n - 1, order_[di]);
if (diff > 0) {
different = true;
}
if (!different && diff < 0) {
increasing = false;
}
}
if (!valid) {
KERNEL_LOG_ERROR("Indices is out of bounds, index=%lld.", n);
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
if (!increasing) {
KERNEL_LOG_ERROR("indices is out of order, index=%lld.", n);
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
if (!different) {
KERNEL_LOG_ERROR("indices is repeated, index=%lld.", n);
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
}
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
/*
* check in parallel chunks that each index is in bounds, in increasing order according to order_, and unique
* @return uint32_t: 0->success other->failed
*/
template <typename T>
uint32_t EigenTensorIndicesValidParaCheck(const CpuKernelContext &ctx, int64_t dims_size) const {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
uint32_t result = static_cast<uint32_t>(KERNEL_STATUS_OK);
(void)aicpu::CpuKernelUtils::ParallelFor(ctx, dims_size, dims_size / max_core_num,
[&](std::int64_t begin, std::int64_t end) {
int64_t start = begin;
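// Row 0 has no predecessor to compare against; its bounds are checked
// separately in EigenTensorIndicesValid, so the first chunk starts at row 1.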
if (begin == 0) {
start = begin + 1;
}
const auto ix_t = ix_->matrix<T>();
for (int64_t n = start; n < end; ++n) {
bool valid = true;
bool different = false;
bool increasing = true;
for (int32_t di = 0; di < dims_; ++di) {
if (ix_t(n, di) < 0 || ix_t(n, di) >= shape_[di]) {
valid = false;
}
int64_t diff = ix_t(n, order_[di]) - ix_t(n - 1, order_[di]);
if (diff > 0) {
different = true;
}
if (!different && diff < 0) {
increasing = false;
}
}
if (!valid) {
KERNEL_LOG_ERROR("Indices is out of bounds, index=%lld.", n);
result = static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
return;
}
if (!increasing) {
KERNEL_LOG_ERROR("indices is out of order, index=%lld.", n);
result = static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
return;
}
if (!different) {
KERNEL_LOG_ERROR("indices is repeated, index=%lld.", n);
result = static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
return;
}
}
});
return result;
}
/*
* validate indices (bounds, ordering and uniqueness); dispatches to the serial or parallel check based on the number of indices
* @return uint32_t: 0->success other->failed
*/
template <typename T>
uint32_t EigenTensorIndicesValid(const CpuKernelContext &ctx) const {
const auto ix_t = ix_->matrix<T>();
int64_t dims_size =
(ix_->GetTensor()->GetTensorShape()->GetDims() == 0) ? 1 : ix_->GetTensor()->GetTensorShape()->GetDimSize(0);
if (dims_size > 0) {
for (int32_t di = 0; di < dims_; ++di) {
if ((ix_t(0, di) < 0) || (ix_t(0, di) >= shape_[di])) {
KERNEL_LOG_ERROR("Indices is out of bounds, index=0.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
const int64_t parallel_data_size = 16 * 1024;
if (dims_size < parallel_data_size) {
return EigenTensorIndicesValidCheck<T>(dims_size);
} else {
return EigenTensorIndicesValidParaCheck<T>(ctx, dims_size);
}
}
/*
* validate sparse to dense
* @param output: output tensor
* @return bool: true->success false->failed
*/
bool ValidateToDense(const Tensor *out) const;
/*
* sparse tensor to dense tensor (parallel implementation)
* @param output: output tensor
* @return uint32_t: 0->success other->failed
*/
template <typename IndiceT, typename ValueT>
uint32_t ToDenseParallel(const CpuKernelContext &ctx, Tensor *output) {
EigenTensor outputET(output, output->GetData());
auto output_t = outputET.flat<ValueT>();
auto ix_t = ix_->matrix<IndiceT>();
std::vector<int64_t> strides(dims_);
const auto &out_shape = output->GetTensorShape();
if (dims_ > 0) {
strides[dims_ - 1] = 1;
}
for (int32_t d = dims_ - 2; d >= 0; --d) {
strides[d] = strides[d + 1] * out_shape->GetDimSize(d + 1);
}
auto vals_t = vals_->vec<ValueT>();
int64_t vals_size = vals_t.dimension(0);
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
uint32_t result = static_cast<uint32_t>(KERNEL_STATUS_OK);
auto parallel_proc = [&](std::int64_t begin, std::int64_t end) {
for (int64_t n = begin; n < end; ++n) {
bool invalid_dims = false;
int64_t ix = 0;
for (int d = 0; d < dims_; ++d) {
const int64_t ix_n_d = ix_t(n, d);
if (ix_n_d >= out_shape->GetDimSize(d)) {
invalid_dims = true;
}
ix += strides[d] * ix_n_d;
}
if (invalid_dims) {
result = static_cast<uint32_t>(KERNEL_STATUS_INNER_ERROR);
KERNEL_LOG_ERROR("Sparse to dense got invalid dims.");
return;
}
output_t(ix) = vals_t(n);
}
return;
};
KERNEL_HANDLE_ERROR(aicpu::CpuKernelUtils::ParallelFor(ctx, vals_size, vals_size / max_core_num, parallel_proc),
"SparseToDense Compute failed.");
return result;
}
/*
* sparse tensor to dense tensor
* @param output: output tensor
* @return uint32_t: 0->success other->failed
*/
template <typename IndiceT, typename ValueT>
uint32_t ToDense(const CpuKernelContext &ctx, Tensor *output) {
KERNEL_LOG_INFO("Start to execute ToDense.");
if (output == nullptr || output->GetData() == nullptr) {
KERNEL_LOG_ERROR("Output tensor is nullptr.");
return KERNEL_STATUS_INNER_ERROR;
}
if (!ValidateToDense(output)) {
KERNEL_LOG_ERROR("Validate to dense param failed.");
return KERNEL_STATUS_INNER_ERROR;
}
auto vals_t = vals_->vec<ValueT>();
int64_t vals_size = vals_t.dimension(0);
const int64_t parallel_data_size = 16 * 1024;
if (vals_size >= parallel_data_size) {
return ToDenseParallel<IndiceT, ValueT>(ctx, output);
}
EigenTensor outputET(output, output->GetData());
auto output_t = outputET.flat<ValueT>();
auto ix_t = ix_->matrix<IndiceT>();
std::vector<int64_t> strides(dims_);
const auto &out_shape = output->GetTensorShape();
if (dims_ > 0) {
strides[dims_ - 1] = 1;
}
for (int32_t d = dims_ - 2; d >= 0; --d) {
strides[d] = strides[d + 1] * out_shape->GetDimSize(d + 1);
}
for (int64_t n = 0; n < vals_size; ++n) {
bool invalid_dims = false;
int64_t ix = 0;
for (int d = 0; d < dims_; ++d) {
const int64_t ix_n_d = ix_t(n, d);
if (ix_n_d >= out_shape->GetDimSize(d)) {
invalid_dims = true;
}
ix += strides[d] * ix_n_d;
}
if (invalid_dims) {
KERNEL_LOG_ERROR("Sparse to dense got invalid dims.");
return KERNEL_STATUS_INNER_ERROR;
}
output_t(ix) = vals_t(n);
}
return KERNEL_STATUS_OK;
}
private:
std::shared_ptr<EigenTensor> ix_;
std::shared_ptr<EigenTensor> vals_;
std::vector<int64_t> shape_;
std::vector<int64_t> order_;
int32_t dims_;
};
} // namespace aicpu
#endif // AICPU_SPARSETENSOR_H

View File

@ -48,14 +48,18 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
kSliceGradOpName,
kRandomShuffleOpName,
kRangeOpName};
static const std::set<std::string> kMigrateAicpuKernelOps = {
mindspore::kACosOpName,
mindspore::kLogMatrixDeterminantOpName,
mindspore::kAdaptiveAvgPool2dOpName,
mindspore::kAdaptiveAvgPool2dGradOpName,
mindspore::kMedianOpName,
mindspore::kMedianGradOpName,
};
static const std::set<std::string> kMigrateAicpuKernelOps = {mindspore::kACosOpName,
mindspore::kAdaptiveAvgPool2dOpName,
mindspore::kAdaptiveAvgPool2dGradOpName,
mindspore::kCacheSwapTableOpName,
mindspore::kFillOpName,
mindspore::kLogMatrixDeterminantOpName,
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMedianOpName,
mindspore::kMedianGradOpName,
mindspore::kNMSWithMaskOpName,
mindspore::kReduceSumOpName};
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";