aicpu migration gp high priority

This commit is contained in:
parent 9f6e5709fa
commit 85f2032bf8
@@ -74,3 +74,16 @@
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/op_proto/add_dsl.cc" "syntaxError"
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/op_proto/matmul_tik.cc" "syntaxError"

# AICPU migration
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "useStlAlgorithm"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "unreadVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "redundantAssignment"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constArgument"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "useStlAlgorithm"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "unreadVariable"
@@ -106,3 +106,28 @@
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "build/include"
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "build/include"

# AICPU migration
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "build/include_subdir"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "runtime/references"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "build/include_what_you_use"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "whitespace/indent"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "whitespace/ending_newline"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "runtime/explicit"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "readability/braces"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "readability/namespace"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "whitespace/braces"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "build/include"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "whitespace/end_of_line"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "readability/casting"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "build/namespaces"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "build/include_subdir"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "runtime/references"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "build/include_what_you_use"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/indent"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/ending_newline"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "runtime/explicit"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "readability/braces"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "readability/namespace"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/braces"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "build/include"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "whitespace/end_of_line"
@@ -266,3 +266,6 @@ mindspore/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc:mindspore
mindspore/mindspore/python/mindspore/ops/function/nn_func.py:conv3d
mindspore/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/matmul_avx512_mask_fp32.c:GemmRowxColMaskKernelFp32
mindspore/mindspore/ccsrc/plugin/device/cpu/kernel/crop_and_resize_cpu_kernel.cc:mindspore::kernel::CropAndResizeCpuKernelMod::LaunchKernel

# AICPU migration
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/mediangrad.cc:aicpu::MedianGradCpuKernel::MedianGradCompute
@@ -45,6 +45,8 @@ constexpr auto kAdamOpName = "Adam";
constexpr auto kAdamWeightDecayName = "AdamWeightDecay";
constexpr auto kAdaptiveMaxPool2dOpName = "AdaptiveMaxPool2d";
constexpr auto kAdaptiveMaxPool2DOpName = "AdaptiveMaxPool2D";
constexpr auto kAdaptiveAvgPool2dOpName = "AdaptiveAvgPool2d";
constexpr auto kAdaptiveAvgPool2dGradOpName = "AdaptiveAvgPool2dGrad";
constexpr auto kAdaptiveMaxPool3DGradOpName = "AdaptiveMaxPool3DGrad";
constexpr auto kAddNOpName = "AddN";
constexpr auto kAddOpName = "Add";
@@ -373,6 +375,7 @@ constexpr auto kLessOpName = "Less";
constexpr auto kLinSpaceOpName = "LinSpace";
constexpr auto kLinSpaceDOpName = "LinSpaceD";
constexpr auto kListDiffOpName = "ListDiff";
constexpr auto kLogMatrixDeterminantOpName = "LogMatrixDeterminant";
constexpr auto kLogOpName = "Log";
constexpr auto kLogSoftmaxOpName = "LogSoftmax";
constexpr auto kLogSoftmaxV2OpName = "LogSoftmaxV2";
@@ -404,6 +407,8 @@ constexpr auto kMaxPoolV2OpName = "MaxPoolV2";
constexpr auto kMaxPoolExt2OpName = "MaxPoolExt2";
constexpr auto kMaxPoolWithArgmaxOpName = "MaxPoolWithArgmax";
constexpr auto kMeanGradOpName = "MeanGrad";
constexpr auto kMedianOpName = "Median";
constexpr auto kMedianGradOpName = "MedianGrad";
constexpr auto kMemCpyAsyncOpName = "memcpy_async";
constexpr auto kMinimumGradOpName = "MinimumGrad";
constexpr auto kMinimumOpName = "Minimum";
@@ -53,9 +53,9 @@ target_link_libraries(mindspore_cpu_kernels PRIVATE
  -pthread
)

set(INSTALL_LIBRARY_DIR lib)
set(INSTALL_LIBRARY_DIR lib/plugin)
install(TARGETS mindspore_cpu_kernels OPTIONAL
        EXPORT mindspore_cpu_kernels-targets
        LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR}
        LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR}/ascend
)
@@ -0,0 +1,199 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "cpu_kernel/ms_kernel/adaptive_avg_pool_2d.h"
|
||||
|
||||
#include "cpu_kernel/common/cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const char *kAdaptiveAvgPool2d = "AdaptiveAvgPool2d";
|
||||
constexpr uint32_t kInputNum = 1;
|
||||
constexpr uint32_t kOutputNum = 1;
|
||||
constexpr int64_t kParallelDataNums = 4 * 1024;
|
||||
constexpr int64_t kthree = 3;
|
||||
constexpr int64_t kneg_three = -3;
|
||||
constexpr int64_t kfour = 4;
|
||||
constexpr int64_t ktwo = 2;
|
||||
constexpr int64_t kneg_two = -2;
|
||||
|
||||
template <typename SCALAR_T>
|
||||
struct AdaptiveCalcArgs {
|
||||
SCALAR_T *input_data = nullptr;
|
||||
SCALAR_T *output_data = nullptr;
|
||||
|
||||
int64_t size_b = 1;
|
||||
int64_t size_d = 0;
|
||||
int64_t in_size_h = 0;
|
||||
int64_t in_size_w = 0;
|
||||
|
||||
int64_t out_size_h = 0;
|
||||
int64_t out_size_w = 0;
|
||||
|
||||
int64_t in_stride_d = 0;
|
||||
int64_t in_stride_h = 0;
|
||||
int64_t in_stride_w = 0;
|
||||
};
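// Field meaning (as used below): the input is treated as a CHW block with size_d channels of
// in_size_h x in_size_w elements; the strides are expressed in elements, so in_stride_w = 1,
// in_stride_h = in_size_w and in_stride_d = in_size_h * in_size_w. size_b defaults to 1; in the
// 4D (batch) case the batch dimension is handled by offsetting the data pointers per sample
// rather than through size_b.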
|
||||
|
||||
#define SWITCH_PARALLEL(SHARD, end_num, num) \
|
||||
if ((num) <= kParallelDataNums) { \
|
||||
for (size_t i = 0; i < size_t(end_num); i++) { \
|
||||
SHARD(i, i + 1); \
|
||||
} \
|
||||
} else { \
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, 1, SHARD), \
|
||||
"AdaptiveAvgPool2d #SHARD Compute failed."); \
|
||||
}
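// SWITCH_PARALLEL runs the given shard serially (one index at a time) when the element count is
// at or below kParallelDataNums (4K elements), and otherwise hands it to
// CpuKernelUtils::ParallelFor with a per-slice granularity of 1. In effect,
//   SWITCH_PARALLEL(shard_frame, args.size_d, num);
// calls shard_frame(i, i + 1) for every channel i on small inputs and lets the framework split
// the [0, size_d) range across worker threads on larger ones.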
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename SCALAR_T>
|
||||
SCALAR_T ComputeSum(int64_t span_h, int64_t span_w, SCALAR_T *in_point, AdaptiveCalcArgs<SCALAR_T> &args) {
|
||||
SCALAR_T sum = static_cast<SCALAR_T>(0.);
|
||||
for (int in_h = 0; in_h < span_h; in_h++) {
|
||||
for (int in_w = 0; in_w < span_w; in_w++) {
|
||||
SCALAR_T val = *(in_point + in_h * args.in_stride_h + in_w * args.in_stride_w);
|
||||
|
||||
sum += static_cast<SCALAR_T>(val);
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
template <typename SCALAR_T>
|
||||
void ComputeSingleThread(int64_t start, int64_t end, AdaptiveCalcArgs<SCALAR_T> args) {
|
||||
for (auto d = start; d < end; d++) {
|
||||
/* loop over output */
|
||||
for (int64_t out_h = 0; out_h < args.out_size_h; out_h++) {
|
||||
int in_start_h = StartIndex(out_h, args.out_size_h, args.in_size_h);
|
||||
int in_end_h = EndIndex(out_h, args.out_size_h, args.in_size_h);
|
||||
int span_h = in_end_h - in_start_h;
|
||||
|
||||
for (int64_t out_w = 0; out_w < args.out_size_w; out_w++) {
|
||||
int in_start_w = StartIndex(out_w, args.out_size_w, args.in_size_w);
|
||||
int in_end_w = EndIndex(out_w, args.out_size_w, args.in_size_w);
|
||||
int span_w = in_end_w - in_start_w;
|
||||
|
||||
// local pointers
|
||||
SCALAR_T *in_point =
|
||||
args.input_data + d * args.in_stride_d + in_start_h * args.in_stride_h + in_start_w * args.in_stride_w;
|
||||
SCALAR_T *out_point =
|
||||
args.output_data + d * args.out_size_h * args.out_size_w + out_h * args.out_size_w + out_w;
|
||||
|
||||
/* compute local average */
|
||||
/* set output to local average */
|
||||
*out_point = SCALAR_T(ComputeSum(span_h, span_w, in_point, args) / static_cast<SCALAR_T>(span_h * span_w));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename SCALAR_T>
|
||||
uint32_t AdaptiveAvgPool2dOutFrame(const CpuKernelContext &ctx, AdaptiveCalcArgs<SCALAR_T> args, int64_t num) {
|
||||
auto shard_frame = [&](int64_t start, int64_t end) { ComputeSingleThread(start, end, args); };
|
||||
SWITCH_PARALLEL(shard_frame, args.size_d, num);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename SCALAR_T>
|
||||
uint32_t AdaptiveAvgPool2dOutTemplate(const CpuKernelContext &ctx) {
|
||||
Tensor &input = *(ctx.Input(kFirstInputIndex));
|
||||
auto input_shape_ptr = input.GetTensorShape();
KERNEL_CHECK_NULLPTR(input_shape_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input 0 shape failed.");
int32_t input_dims = input_shape_ptr->GetDims();
|
||||
|
||||
KERNEL_CHECK_FALSE((input_dims == kthree || input_dims == kfour), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Non-empty [3D] or [4D] (batch mode) tensor expected for input 0.");
|
||||
|
||||
for (int32_t i = 0; i < input_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input_shape_ptr->GetDimSize(i) > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Adaptive_avg_pool2d: expected input to have non-empty spatial "
|
||||
"dimensions, "
|
||||
"but input 0 has sizes [%d] with dimension [%d] being empty.",
|
||||
input_dims, i);
|
||||
}
|
||||
|
||||
AdaptiveCalcArgs<SCALAR_T> args;
|
||||
// sizes
|
||||
std::vector<int64_t> input_dim_sizes = input_shape_ptr->GetDimSizes();
|
||||
args.size_d = input_dim_sizes.end()[kneg_three];
|
||||
args.in_size_h = input_dim_sizes.end()[kneg_two];
|
||||
args.in_size_w = input_dim_sizes.end()[-1];
|
||||
|
||||
// strides
|
||||
args.in_stride_w = 1;
|
||||
args.in_stride_h = args.in_size_w;
|
||||
args.in_stride_d = args.in_stride_h * args.in_size_h;
|
||||
|
||||
// output sizes
|
||||
AttrValue *attr = ctx.GetAttr("output_size");
|
||||
std::vector<int64_t> output_size_data = attr->GetListInt();
|
||||
if (output_size_data.size() == ktwo) {
|
||||
args.out_size_h = output_size_data[0] > 0 ? output_size_data[0] : input_dim_sizes.end()[-2];
|
||||
args.out_size_w = output_size_data[1] > 0 ? output_size_data[1] : input_dim_sizes.end()[-1];
|
||||
} else if (output_size_data.size() == 1) {
|
||||
KERNEL_CHECK_FALSE((output_size_data[0] >= 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Adaptive_avg_pool2d: output_size value should be non-negative");
|
||||
args.out_size_h = output_size_data[0];
|
||||
args.out_size_w = output_size_data[0];
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("output_size length should be 1 OR 2, but got [%d]", output_size_data.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
// indices will contain i,j locations for each output point
|
||||
args.input_data = static_cast<SCALAR_T *>(input.GetData());
|
||||
args.output_data = static_cast<SCALAR_T *>(ctx.Output(kFirstOutputIndex)->GetData());
|
||||
int64_t num = input.NumElements();
|
||||
// resize output
|
||||
if (input_dims == kthree) {
|
||||
AdaptiveAvgPool2dOutFrame<SCALAR_T>(ctx, args, num);
|
||||
} else {
|
||||
auto shard_template = [&](int64_t start, int64_t end) {
|
||||
for (auto b = start; b < end; b++) {
|
||||
AdaptiveCalcArgs<SCALAR_T> sub_args = args;
|
||||
sub_args.input_data = args.input_data + b * args.in_stride_d * args.size_d;
|
||||
sub_args.output_data = args.output_data + b * args.size_d * args.out_size_h * args.out_size_w;
|
||||
AdaptiveAvgPool2dOutFrame<SCALAR_T>(ctx, sub_args, num);
|
||||
}
|
||||
};
|
||||
SWITCH_PARALLEL(shard_template, input_dim_sizes[0], num);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t AdaptiveAvgPool2d::Compute(const CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output number failed.",
|
||||
kAdaptiveAvgPool2d);
|
||||
|
||||
Tensor *input_0 = ctx.Input(kFirstInputIndex);
|
||||
auto data_type = static_cast<DataType>(input_0->GetDataType());
|
||||
// Compute by data_type
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return AdaptiveAvgPool2dOutTemplate<float>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return AdaptiveAvgPool2dOutTemplate<Eigen::half>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("AdaptiveAvgPool2d kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kAdaptiveAvgPool2d, AdaptiveAvgPool2d);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,46 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_ADAPTIVE_AVG_POOL2D_H_
#define AICPU_KERNELS_NORMALIZED_ADAPTIVE_AVG_POOL2D_H_

#include <cmath>

#include "cpu_kernel/inc/cpu_ops_kernel.h"
#include "cpu_kernel/inc/cpu_types.h"

namespace aicpu {
class AdaptiveAvgPool2d : public CpuKernel {
 public:
  AdaptiveAvgPool2d() = default;
  ~AdaptiveAvgPool2d() = default;
  uint32_t Compute(const CpuKernelContext &ctx) override;
};

inline int StartIndex(int offset, int out_size, int in_size) {
  if (out_size != 0) {
    return static_cast<int>(std::floor(static_cast<float>((offset * in_size)) / out_size));
  }
  return 0;
}

inline int EndIndex(int offset, int out_size, int in_size) {
  if (out_size != 0) {
    return static_cast<int>(std::ceil(static_cast<float>(((offset + 1) * in_size)) / out_size));
  }
  return 0;
}
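// Together, StartIndex/EndIndex split the input extent into out_size windows whose union covers
// [0, in_size): output cell `offset` averages input rows/cols in
// [floor(offset * in_size / out_size), ceil((offset + 1) * in_size / out_size)).
// Illustrative example for in_size = 5, out_size = 3: the windows are [0, 2), [1, 4) and [3, 5).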
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_ADAPTIVE_AVG_POOL2D_H_
@@ -0,0 +1,213 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "cpu_kernel/ms_kernel/adaptive_avg_pool_2d_grad.h"
|
||||
#include <cmath>
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "cpu_kernel/common/cpu_kernel_utils.h"
|
||||
|
||||
namespace {
|
||||
const char *kAdaptiveAvgPool2dGrad = "AdaptiveAvgPool2dGrad";
|
||||
template <typename SCALAR_T>
|
||||
struct AdaptiveCalcArgs {
|
||||
SCALAR_T *input_data = nullptr;
|
||||
SCALAR_T *output_data = nullptr;
|
||||
|
||||
int64_t in_size_b = 0;
|
||||
int64_t in_size_d = 0;
|
||||
int64_t in_size_h = 0;
|
||||
int64_t in_size_w = 0;
|
||||
int64_t out_size_h = 0;
|
||||
int64_t out_size_w = 0;
|
||||
|
||||
int64_t out_stride_d = 0;
|
||||
int64_t in_stride_d = 0;
|
||||
int64_t out_stride_h = 0;
|
||||
int64_t in_stride_h = 0;
|
||||
};
|
||||
|
||||
// out_size must not be zero
|
||||
inline int StartIndex(int offset, int out_size, int in_size) {
|
||||
return (int)std::floor((float)(offset * in_size) / out_size);
|
||||
}
|
||||
|
||||
// out_size must not be zero
|
||||
inline int EndIndex(int offset, int out_size, int in_size) {
|
||||
return (int)std::ceil((float)((offset + 1) * in_size) / out_size);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename SCALAR_T>
|
||||
uint32_t AdaptiveAvgPool2dGradOutFrame(const CpuKernelContext &ctx, AdaptiveCalcArgs<SCALAR_T> args) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
|
||||
int64_t total_size = args.in_size_d * args.in_size_b * args.out_size_h * args.out_size_w;
|
||||
int64_t max_core_num_total = max_core_num;
|
||||
if (max_core_num_total > total_size) {
|
||||
max_core_num_total = total_size;
|
||||
}
|
||||
auto shard_init = [&](int64_t start, int64_t end) {
|
||||
for (auto c = start; c < end; c++) {
|
||||
args.output_data[c] = (SCALAR_T)0;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, total_size, total_size / max_core_num_total, shard_init),
|
||||
"AdaptiveAvgPool2dGrad Compute failed.");
|
||||
|
||||
int64_t in_size_db = args.in_size_d * args.in_size_b;
|
||||
if (max_core_num > in_size_db) {
|
||||
max_core_num = in_size_db;
|
||||
}
|
||||
// treat batch size and channels as one dimension
|
||||
auto shard_work = [&](int64_t start, int64_t end) {
|
||||
for (auto c = start; c < end; c++) {
|
||||
SCALAR_T *output_offset_ptr = args.output_data + c * args.out_stride_d;
|
||||
SCALAR_T *input_offset_ptr = args.input_data + c * args.in_stride_d;
|
||||
|
||||
for (int64_t ih = 0; ih < args.in_size_h; ih++) {
|
||||
int64_t out_start_h = StartIndex(ih, args.in_size_h, args.out_size_h);
|
||||
int64_t out_end_h = EndIndex(ih, args.in_size_h, args.out_size_h);
|
||||
int64_t step_h = out_end_h - out_start_h;
|
||||
for (int64_t iw = 0; iw < args.in_size_w; iw++) {
|
||||
int64_t out_start_w = StartIndex(iw, args.in_size_w, args.out_size_w);
|
||||
int64_t out_end_w = EndIndex(iw, args.in_size_w, args.out_size_w);
|
||||
int64_t step_w = out_end_w - out_start_w;
|
||||
if (step_w == 0 || step_h == 0) {
|
||||
continue;
|
||||
}
|
||||
SCALAR_T grad_delta = input_offset_ptr[ih * args.in_stride_h + iw] / step_h / step_w;
|
||||
int64_t oh = 0, ow = 0, output_size = args.out_stride_d;
|
||||
for (oh = out_start_h; oh < out_end_h; oh++) {
|
||||
for (ow = out_start_w; ow < out_end_w; ow++) {
|
||||
int64_t output_idx = oh * args.out_stride_h + ow;
|
||||
KERNEL_CHECK_FALSE_VOID((output_idx < output_size),
|
||||
"Feature map output_idx [%lld] overflow output_size [%lld].", output_idx,
|
||||
output_size);
|
||||
output_offset_ptr[output_idx] += grad_delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, in_size_db, in_size_db / max_core_num, shard_work),
|
||||
"AdaptiveAvgPool2dGrad Compute failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
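// Backward rule used above: each cell (ih, iw) of the incoming gradient (args.input_data, the
// pooled output of the forward pass) maps back to the original-input window
// [out_start_h, out_end_h) x [out_start_w, out_end_w) it was averaged over, so its value is
// divided by the window area step_h * step_w and accumulated into every cell of that window.
// Illustrative example: pooling a 4x4 map down to 2x2 uses 2x2 windows, so a gradient value g at
// (0, 0) contributes g / 4 to each of the four cells of the top-left input window.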
|
||||
|
||||
template <typename SCALAR_T>
|
||||
uint32_t AdaptiveAvgPool2dGradOutCpuTemplate(const CpuKernelContext &ctx) {
|
||||
Tensor &input = *(ctx.Input(kFirstInputIndex));
|
||||
|
||||
auto input_shape_ptr = input.GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(input_shape_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input x shape failed.");
|
||||
int32_t input_dims = input_shape_ptr->GetDims();
|
||||
|
||||
for (int32_t i = 0; i < input_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input_shape_ptr->GetDimSize(i) > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Adaptive_avg_pool2d_grad: expected input to have non-empty spatial dimensions, "
|
||||
"but input has sizes [%d] with dimension [%d] being empty.",
|
||||
input_dims, i);
|
||||
}
|
||||
|
||||
KERNEL_CHECK_FALSE(input_dims == 4, KERNEL_STATUS_PARAM_INVALID, "Non-empty [4D] tensor expected for input.");
|
||||
|
||||
AdaptiveCalcArgs<SCALAR_T> args;
|
||||
args.in_size_b = 1;
|
||||
args.in_size_d = 0;
|
||||
args.in_size_h = 0;
|
||||
args.in_size_w = 0;
|
||||
args.out_size_h = 0;
|
||||
args.out_size_w = 0;
|
||||
args.out_stride_d = 1;
|
||||
args.in_stride_d = 1;
|
||||
args.out_stride_h = 1;
|
||||
args.in_stride_h = 1;
|
||||
|
||||
std::vector<int64_t> orig_input_size = ctx.GetAttr("orig_input_shape")->GetListInt();
|
||||
KERNEL_CHECK_FALSE((orig_input_size.size() == 4), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Adaptive_avg_pool2d_grad: internal error, orig_input_size.size() must be [4]");
|
||||
KERNEL_CHECK_FALSE((input_shape_ptr->GetDimSize(0) == orig_input_size[0]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Adaptive_avg_pool2d_grad: internal error, orig_input_size Batch must equal "
|
||||
"input_size Batch, now orig_input_size Batch is [%lld], input_size Batch is [%lld].",
|
||||
input_shape_ptr->GetDimSize(0), orig_input_size[0]);
|
||||
KERNEL_CHECK_FALSE((input_shape_ptr->GetDimSize(1) == orig_input_size[1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Adaptive_avg_pool2d_grad: internal error, orig_input_size Channel must equal "
|
||||
"input_size channel, now orig_input_size Channel is [%lld], input_size Channel is [%lld].",
|
||||
input_shape_ptr->GetDimSize(1), orig_input_size[1]);
|
||||
|
||||
int dim_w = 3;
|
||||
int dim_h = 2;
|
||||
// sizes
|
||||
args.in_size_d = input_shape_ptr->GetDimSize(dim_h - 1);
|
||||
args.in_size_h = input_shape_ptr->GetDimSize(dim_h);
|
||||
args.in_size_w = input_shape_ptr->GetDimSize(dim_w);
|
||||
|
||||
args.out_size_h = orig_input_size[dim_h];
|
||||
args.out_size_w = orig_input_size[dim_w];
|
||||
KERNEL_CHECK_FALSE((args.out_size_h != 0 && args.out_size_w != 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Adaptive_avg_pool2d_grad: internal error, output_size H or W can not be zero, "
|
||||
"now H is [%lld], W is [%lld].",
|
||||
args.out_size_h, args.out_size_w);
|
||||
// strides
|
||||
// The calculation does not overflow because the maximum value is the number of user input
// elements, which is less than the int64_t range.
|
||||
args.out_stride_d = args.out_size_h * args.out_size_w;
|
||||
args.out_stride_h = args.out_size_w;
|
||||
args.in_stride_d = args.in_size_h * args.in_size_w;
|
||||
args.in_stride_h = args.in_size_w;
|
||||
|
||||
args.input_data = static_cast<SCALAR_T *>(input.GetData());
|
||||
args.output_data = static_cast<SCALAR_T *>(ctx.Output(kFirstOutputIndex)->GetData());
|
||||
|
||||
return AdaptiveAvgPool2dGradOutFrame<SCALAR_T>(ctx, args);
|
||||
}
|
||||
|
||||
uint32_t AdaptiveAvgPool2dGrad::Compute(const CpuKernelContext &ctx) {
|
||||
Tensor *input_0 = ctx.Input(kFirstInputIndex);
|
||||
KERNEL_CHECK_NULLPTR(input_0, KERNEL_STATUS_PARAM_INVALID, "Get input tensor failed.");
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.");
|
||||
Tensor *output_0 = ctx.Output(kFirstOutputIndex);
|
||||
KERNEL_CHECK_NULLPTR(output_0, KERNEL_STATUS_PARAM_INVALID, "Get output tensor failed.");
|
||||
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.");
|
||||
|
||||
AttrValue *attr_orig_input_shape = ctx.GetAttr("orig_input_shape");
|
||||
KERNEL_CHECK_NULLPTR(attr_orig_input_shape, KERNEL_STATUS_PARAM_INVALID, "[%s] get attr:orig_input_shape failed.",
|
||||
kAdaptiveAvgPool2dGrad);
|
||||
std::vector<int64_t> v_orig_input_shape = attr_orig_input_shape->GetListInt();
|
||||
|
||||
KERNEL_LOG_INFO("AdaptiveAvgPool2dGrad kernel, input[0]: size is [%llu]; output_0: size is [%llu].",
|
||||
input_0->GetDataSize(), output_0->GetDataSize());
|
||||
KERNEL_LOG_INFO("[%s] get attr:orig_input_shape [%s].", kAdaptiveAvgPool2dGrad,
|
||||
VectorToString(v_orig_input_shape).c_str());
|
||||
|
||||
auto data_type = static_cast<DataType>(input_0->GetDataType());
|
||||
// Compute by data_type
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return AdaptiveAvgPool2dGradOutCpuTemplate<float>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return AdaptiveAvgPool2dGradOutCpuTemplate<Eigen::half>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("AdaptiveAvgPool2dGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kAdaptiveAvgPool2dGrad, AdaptiveAvgPool2dGrad);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,32 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_ADAPTIVE_AVG_POOL2D_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_ADAPTIVE_AVG_POOL2D_GRAD_H_

#include "cpu_kernel/inc/cpu_ops_kernel.h"
#include "cpu_kernel/inc/cpu_types.h"

namespace aicpu {
class AdaptiveAvgPool2dGrad : public CpuKernel {
 public:
  AdaptiveAvgPool2dGrad() = default;
  ~AdaptiveAvgPool2dGrad() = default;

 protected:
  uint32_t Compute(const CpuKernelContext &ctx) override;
};
}  // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_ADAPTIVE_AVG_POOL2D_GRAD_H_
@@ -0,0 +1,170 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "log_matrix_determinant.h"
|
||||
|
||||
#include "Eigen/LU"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 2;
|
||||
const uint32_t kInputNum = 1;
|
||||
const uint32_t kIndexTwo = 2;
|
||||
const char *const kLogMatrixDeterminant = "LogMatrixDeterminant";
|
||||
constexpr int64_t kParallelDataNums = 8 * 1024;
|
||||
|
||||
#define LOG_MATRIX_DETERMINANT_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = LogMatrixDeterminantCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("LogMatrixDeterminant kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t LogMatrixDeterminantCpuKernel::Compute(const CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.",
|
||||
kLogMatrixDeterminant);
|
||||
KERNEL_HANDLE_ERROR(LogMatrixDeterminantCheck(ctx), "[%s] check params failed.", kLogMatrixDeterminant);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
LOG_MATRIX_DETERMINANT_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
LOG_MATRIX_DETERMINANT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
LOG_MATRIX_DETERMINANT_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
LOG_MATRIX_DETERMINANT_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("LogMatrixDeterminant kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t LogMatrixDeterminantCpuKernel::LogMatrixDeterminantCheck(const CpuKernelContext &ctx) {
|
||||
auto input_0 = ctx.Input(0);
|
||||
auto output_0 = ctx.Output(0);
|
||||
auto output_1 = ctx.Output(1);
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input x data failed.")
|
||||
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output sign data failed.")
|
||||
KERNEL_CHECK_NULLPTR(output_1->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output y data failed.")
|
||||
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input x tensor shape failed.")
|
||||
KERNEL_CHECK_NULLPTR(output_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get output sign tensor shape failed.")
|
||||
KERNEL_CHECK_NULLPTR(output_1->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get output y tensor shape failed.")
|
||||
std::vector<int64_t> shape_x = input_0->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_sign = output_0->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_y = output_1->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size_x = shape_x.size();
|
||||
size_t shape_size_sign = shape_sign.size();
|
||||
size_t shape_size_y = shape_y.size();
|
||||
KERNEL_CHECK_FALSE((shape_size_x > 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 2, got [%zu].",
|
||||
shape_size_x)
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size_x - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input x last dimension must be at least 1.")
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size_x - kIndexTwo] == shape_x[shape_size_x - 1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input x dimensions must be equal, but are [%lld] and [%lld].", shape_x[shape_size_x - kIndexTwo],
|
||||
shape_x[shape_size_x - 1])
|
||||
|
||||
KERNEL_CHECK_FALSE((shape_size_sign == shape_size_x - kIndexTwo), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Output sign must be rank [%zu], got [%zu].", shape_size_x - kIndexTwo, shape_size_sign)
|
||||
KERNEL_CHECK_FALSE((shape_size_y == shape_size_x - kIndexTwo), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Output y must be rank [%zu], got [%zu].", shape_size_x - kIndexTwo, shape_size_y)
|
||||
for (size_t i = 0; i < shape_size_x - kIndexTwo; i++) {
|
||||
KERNEL_CHECK_FALSE((shape_sign[i] == shape_x[i]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Output sign and Input x dimension [%zu] must be equal, got [%lld] and [%lld].", i,
|
||||
shape_sign[i], shape_x[i])
|
||||
KERNEL_CHECK_FALSE((shape_y[i] == shape_x[i]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Output y and Input x dimension [%zu] must be equal, got [%lld] and [%lld].", i, shape_y[i],
|
||||
shape_x[i])
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t LogMatrixDeterminantCpuKernel::LogMatrixDeterminantCompute(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_sign = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(1)->GetData());
|
||||
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size = shape_x.size();
|
||||
int64_t m = shape_x[shape_size - 1];
|
||||
int64_t size_mm = m * m;
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartixXd;
|
||||
using RealT = typename Eigen::NumTraits<T>::Real;
|
||||
if (size_mm > 0) {
|
||||
int64_t martix_num = ctx.Input(0)->NumElements() / size_mm;
|
||||
int64_t data_size = ctx.Input(0)->NumElements() * static_cast<int64_t>(sizeof(T));
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < martix_num; i++) {
|
||||
RealT log_abs_det = 0;
|
||||
T sign = 1;
|
||||
Eigen::Map<MartixXd> martix_x(input_x + i * m * m, m, m);
|
||||
if (martix_x.size() > 0) {
|
||||
Eigen::PartialPivLU<MartixXd> lu(martix_x);
|
||||
MartixXd LU = lu.matrixLU();
|
||||
sign = lu.permutationP().determinant();
|
||||
auto diag = LU.diagonal().array().eval();
|
||||
auto abs_diag = diag.cwiseAbs().eval();
|
||||
log_abs_det += abs_diag.log().sum();
|
||||
sign *= (diag / abs_diag).prod();
|
||||
}
|
||||
if (!Eigen::numext::isfinite(log_abs_det)) {
|
||||
sign = 0;
|
||||
log_abs_det = log_abs_det > 0 ? -std::log(RealT(0)) : std::log(RealT(0));
|
||||
}
|
||||
*(output_sign + i) = sign;
|
||||
*(output_y + i) = log_abs_det;
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > martix_num) {
|
||||
max_core_num = martix_num;
|
||||
}
|
||||
auto shard_work = [&](size_t start, size_t end) {
|
||||
RealT log_abs_det = 0;
|
||||
for (size_t i = start; i < end; i++) {
|
||||
log_abs_det = 0;
|
||||
T sign = 1;
|
||||
Eigen::Map<MartixXd> martix_x(input_x + i * m * m, m, m);
|
||||
if (martix_x.size() > 0) {
|
||||
Eigen::PartialPivLU<MartixXd> lu(martix_x);
|
||||
MartixXd LU = lu.matrixLU();
|
||||
sign = static_cast<T>(lu.permutationP().determinant());
|
||||
auto diag = LU.diagonal().array().eval();
|
||||
auto abs_diag = diag.cwiseAbs().eval();
|
||||
log_abs_det += abs_diag.log().sum();
|
||||
sign *= (diag / abs_diag).prod();
|
||||
}
|
||||
if (!Eigen::numext::isfinite(log_abs_det)) {
|
||||
sign = 0;
|
||||
log_abs_det = log_abs_det > 0 ? -std::log(RealT(0)) : std::log(RealT(0));
|
||||
}
|
||||
*(output_sign + i) = sign;
|
||||
*(output_y + i) = log_abs_det;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, martix_num, martix_num / max_core_num, shard_work),
|
||||
"LogMatrixDeterminant Compute failed.");
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
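// Math behind the per-matrix computation above: PartialPivLU factors A as P^-1 * L * U with
// unit-diagonal L, so det(A) = det(P)^-1 * prod(U_ii), and det(P) is +/-1. Hence
// sign(det A) = det(P) * prod(sign(U_ii)) and log|det A| = sum(log|U_ii|), which is what the
// diag / abs_diag product and abs_diag.log().sum() expressions compute. Rough numeric check for
// A = [[2, 0], [0, 3]]: no pivoting occurs, U = A, so sign = 1 and log|det| = log 2 + log 3 = log 6.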
|
||||
REGISTER_CPU_KERNEL(kLogMatrixDeterminant, LogMatrixDeterminantCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,35 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_LOG_MATRIX_DETERMINANT_H
#define AICPU_KERNELS_NORMALIZED_LOG_MATRIX_DETERMINANT_H

#include "cpu_ops_kernel.h"

namespace aicpu {
class LogMatrixDeterminantCpuKernel : public CpuKernel {
 public:
  LogMatrixDeterminantCpuKernel() = default;
  ~LogMatrixDeterminantCpuKernel() override = default;
  uint32_t Compute(const CpuKernelContext &ctx) override;

 private:
  uint32_t LogMatrixDeterminantCheck(const CpuKernelContext &ctx);

  template <typename T>
  uint32_t LogMatrixDeterminantCompute(const CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,212 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "median.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 1;
|
||||
const uint32_t kOutputNum = 2;
|
||||
const char *kMedian = "Median";
|
||||
|
||||
#define MEDIAN_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MedianCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Median kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define GLOBAL_MEDIAN_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = GlobalMedianCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Median kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MedianCpuKernel::Compute(const CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(MedianCheck(ctx), "Median check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
AttrValue *global_ptr = ctx.GetAttr("global_median");
|
||||
bool global_median_bool = global_ptr->GetBool();
|
||||
if (global_median_bool == false) {
|
||||
switch (data_type) {
|
||||
MEDIAN_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
MEDIAN_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
MEDIAN_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
MEDIAN_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MEDIAN_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Median kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
} else {
|
||||
switch (data_type) {
|
||||
GLOBAL_MEDIAN_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
GLOBAL_MEDIAN_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
GLOBAL_MEDIAN_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
GLOBAL_MEDIAN_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
GLOBAL_MEDIAN_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Median kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MedianCpuKernel::MedianCheck(const CpuKernelContext &ctx) {
|
||||
auto global_median = ctx.GetAttr("global_median");
|
||||
KERNEL_CHECK_NULLPTR(global_median, KERNEL_STATUS_PARAM_INVALID, "Get attr global_median failed.");
|
||||
bool global_median_value = global_median->GetBool();
|
||||
if (global_median_value == false) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Median check input and output number failed.");
|
||||
auto input_shape_ptr = ctx.Input(0)->GetTensorShape();
|
||||
int64_t input_shape_dims = input_shape_ptr->GetDims();
|
||||
int64_t dim_num = 0;
|
||||
AttrValue *dim_ptr = ctx.GetAttr("axis");
|
||||
if (dim_ptr != nullptr) dim_num = dim_ptr->GetInt();
|
||||
if (input_shape_dims != 0) {
|
||||
KERNEL_CHECK_FALSE((dim_num >= (0 - input_shape_dims) && dim_num <= (input_shape_dims - 1)),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"IndexError: Dimension out of range "
|
||||
"(expected to be in range of [[%lld], [%lld]], but got [%lld])",
|
||||
(0 - input_shape_dims), (input_shape_dims - 1), dim_num);
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE((dim_num >= -1 && dim_num <= 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"IndexError: Dimension out of range "
|
||||
"(expected to be in range of [[%lld], [%lld]], but got [%lld])",
|
||||
-1, 0, dim_num);
|
||||
}
|
||||
} else {
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(input_0, KERNEL_STATUS_PARAM_INVALID, "Get input failed.");
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.");
|
||||
Tensor *output_0 = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output_0, KERNEL_STATUS_PARAM_INVALID, "Get output_0 failed.");
|
||||
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data 0 failed.");
|
||||
}
|
||||
if (global_median_value == false) {
|
||||
KERNEL_LOG_DEBUG(
|
||||
"MedianCpuKernel[%s], input0: size[%llu];"
|
||||
"output0: size[%llu], output1: size[%llu].",
|
||||
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize(), ctx.Output(1)->GetDataSize());
|
||||
} else {
|
||||
KERNEL_LOG_DEBUG(
|
||||
"MedianCpuKernel[%s], input0: size[%llu];"
|
||||
"output0: size[%llu].",
|
||||
ctx.GetOpType().c_str(), ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MedianCpuKernel::GlobalMedianCompute(const CpuKernelContext &ctx) {
|
||||
auto input_x0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y0 = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
size_t data_num = ctx.Input(0)->GetTensorShape()->NumElements();
|
||||
const int64_t half = 2;
|
||||
std::nth_element(input_x0, input_x0 + static_cast<int64_t>((data_num - 1) / half), input_x0 + data_num);
|
||||
*output_y0 = *(input_x0 + static_cast<int64_t>((data_num - 1) / half));
|
||||
return KERNEL_STATUS_OK;
|
||||
}
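// std::nth_element partially sorts the flattened input in place so that the element at position
// (data_num - 1) / 2 is the one a full sort would place there (average O(n)); for even counts
// this is the lower median. Illustrative example: for {5, 1, 4, 2} the selected position is
// index 1 of the sorted order {1, 2, 4, 5}, so the kernel returns 2. The input buffer is
// reordered as a side effect.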
|
||||
|
||||
template <typename T>
|
||||
uint32_t MedianCpuKernel::MedianCompute(const CpuKernelContext &ctx) {
|
||||
auto input_x0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y0 = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto output_y1 = reinterpret_cast<int64_t *>(ctx.Output(1)->GetData());
|
||||
|
||||
auto input_shape_ptr = ctx.Input(0)->GetTensorShape();
|
||||
int64_t input_shape_dims = input_shape_ptr->GetDims();
|
||||
if (input_shape_dims == 0) {
|
||||
*output_y0 = *input_x0;
|
||||
*output_y1 = 0;
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
int64_t dim_num = 0;
|
||||
AttrValue *dim_ptr = ctx.GetAttr("axis");
|
||||
if (dim_ptr != nullptr) {
|
||||
dim_num = dim_ptr->GetInt();
|
||||
}
|
||||
if (dim_num < 0) {
|
||||
dim_num += input_shape_dims;
|
||||
}
|
||||
auto input_shape_0 = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
int64_t dim_data_num = input_shape_0[dim_num];
|
||||
T *temp_median_vec = new T[dim_data_num];
|
||||
int64_t *temp_median_index_vec = new int64_t[dim_data_num];
|
||||
int64_t group = 1;
|
||||
int64_t jump = 1;
|
||||
|
||||
int64_t median_pos = static_cast<int64_t>((dim_data_num - 1) / 2);
|
||||
|
||||
if (dim_num != 0) {
|
||||
for (int64_t i = 0; i < dim_num; i++) {
|
||||
group *= input_shape_0[i];
|
||||
}
|
||||
}
|
||||
if (dim_num != input_shape_dims - 1) {
|
||||
for (int64_t i = dim_num + 1; i < input_shape_dims; i++) {
|
||||
jump *= input_shape_0[i];
|
||||
}
|
||||
}
|
||||
|
||||
T *start = input_x0;
|
||||
for (int64_t i = 0; i < group; i++) {
|
||||
for (int64_t j = 0; j < jump; j++) {
|
||||
for (int64_t k = 0; k < dim_data_num; k++) {
|
||||
auto num_index = start + k * jump + j;
|
||||
temp_median_index_vec[k] = k;
|
||||
temp_median_vec[k] = *num_index;
|
||||
}
|
||||
std::nth_element(temp_median_index_vec, temp_median_index_vec + median_pos, temp_median_index_vec + dim_data_num,
|
||||
[&temp_median_vec, dim_data_num](int64_t pos1, int64_t pos2) {
|
||||
return (*(temp_median_vec + pos1) < *(temp_median_vec + pos2)) ||
|
||||
(pos1 >= 0 && pos1 < dim_data_num &&
|
||||
*(temp_median_vec + pos1) == *(temp_median_vec + pos2) && pos1 < pos2);
|
||||
});
|
||||
std::nth_element(temp_median_vec, temp_median_vec + median_pos, temp_median_vec + dim_data_num);
|
||||
*(output_y0 + i * jump + j) = *(temp_median_vec + median_pos);
|
||||
*(output_y1 + i * jump + j) = *(temp_median_index_vec + median_pos);
|
||||
}
|
||||
if (i != group - 1) {
|
||||
start += jump * dim_data_num;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] temp_median_vec;
|
||||
delete[] temp_median_index_vec;
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
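// Index arithmetic used above: with reduction axis dim_num, the input decomposes into `group`
// leading slices (product of the dimensions before the axis) and `jump` trailing columns
// (product of the dimensions after the axis); the dim_data_num elements of one column sit at
// start + k * jump + j. The first nth_element selects the index of the median (ties broken
// toward the smaller original position), the second selects its value.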
|
||||
REGISTER_CPU_KERNEL(kMedian, MedianCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,38 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MEDIAN_H_
#define AICPU_KERNELS_NORMALIZED_MEDIAN_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MedianCpuKernel : public CpuKernel {
 public:
  MedianCpuKernel() = default;
  ~MedianCpuKernel() override = default;
  uint32_t Compute(const CpuKernelContext &ctx) override;

 private:
  uint32_t MedianCheck(const CpuKernelContext &ctx);
  template <typename T>
  uint32_t GlobalMedianCompute(const CpuKernelContext &ctx);
  template <typename T>
  uint32_t MedianCompute(const CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,280 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "mediangrad.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kMedianGrad = "MedianGrad";
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 4;
|
||||
const uint32_t kGlobalOutputNum = 1;
|
||||
const uint32_t kGlobalInputNum = 3;
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
|
||||
#define MEDIANGRAD_COMPUTE_CASE(DTYPE, TYPE, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MedianGradCompute<TYPE, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MedianGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define GLOBALMEDIANGRAD_COMPUTE_CASE(DTYPE, TYPE, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = GlobalMedianGradCompute<TYPE, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("GlobalMedianGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MedianGradCpuKernel::Compute(const CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(MedianGradParamCheck(ctx), "MedianGrad check params failed.");
|
||||
auto data_type_x = ctx.Input(1)->GetDataType();
|
||||
AttrValue *global_median_ptr = ctx.GetAttr("global_median");
|
||||
bool global_median = global_median_ptr->GetBool();
|
||||
if (global_median == false) {
|
||||
switch (data_type_x) {
|
||||
MEDIANGRAD_COMPUTE_CASE(DT_INT16, int16_t, float, ctx)
|
||||
MEDIANGRAD_COMPUTE_CASE(DT_INT32, int32_t, float, ctx)
|
||||
MEDIANGRAD_COMPUTE_CASE(DT_INT64, int64_t, float, ctx)
|
||||
MEDIANGRAD_COMPUTE_CASE(DT_FLOAT, float, float, ctx)
|
||||
MEDIANGRAD_COMPUTE_CASE(DT_DOUBLE, double, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MedianGrad kernel data type [%s] of input x not support.", DTypeStr(data_type_x).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
} else {
|
||||
switch (data_type_x) {
|
||||
GLOBALMEDIANGRAD_COMPUTE_CASE(DT_INT16, int16_t, float, ctx)
|
||||
GLOBALMEDIANGRAD_COMPUTE_CASE(DT_INT32, int32_t, float, ctx)
|
||||
GLOBALMEDIANGRAD_COMPUTE_CASE(DT_INT64, int64_t, float, ctx)
|
||||
GLOBALMEDIANGRAD_COMPUTE_CASE(DT_FLOAT, float, float, ctx)
|
||||
GLOBALMEDIANGRAD_COMPUTE_CASE(DT_DOUBLE, double, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("GlobalMedianGrad kernel data type [%s] of input x not support.",
|
||||
DTypeStr(data_type_x).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MedianGradCpuKernel::MedianGradParamCheck(const CpuKernelContext &ctx) {
|
||||
auto global_median_ptr = ctx.GetAttr("global_median");
|
||||
KERNEL_CHECK_NULLPTR(global_median_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr global_median failed.");
|
||||
bool global_median = global_median_ptr->GetBool();
|
||||
|
||||
if (global_median == false) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MedianGrad check input and output number failed.");
|
||||
} else {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kGlobalInputNum, kGlobalOutputNum),
|
||||
"GlobalMedianGrad check input and output number failed.");
|
||||
}
|
||||
|
||||
Tensor *input_y_grad = ctx.Input(0);
|
||||
Tensor *input_x = ctx.Input(1);
|
||||
Tensor *input_y = ctx.Input(2);
|
||||
Tensor *output_x_grad = ctx.Output(0);
|
||||
|
||||
int64_t y_grad_num = ctx.Input(0)->GetTensorShape()->NumElements();
|
||||
int64_t y_num = ctx.Input(2)->GetTensorShape()->NumElements();
|
||||
KERNEL_CHECK_FALSE((y_num == y_grad_num), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data num of input y_grad [%llu] is different from y [%llu].", y_grad_num, y_num)
|
||||
auto data_type_x = ctx.Input(1)->GetDataType();
|
||||
auto data_type_y_grad = ctx.Input(0)->GetDataType();
|
||||
KERNEL_CHECK_FALSE((data_type_y_grad == data_type_x), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input y_grad [%s] is different from x [%s].", DTypeStr(data_type_y_grad).c_str(),
|
||||
DTypeStr(data_type_x).c_str())
|
||||
|
||||
if (global_median == false) {
|
||||
Tensor *input_indices = ctx.Input(3);
|
||||
KERNEL_LOG_DEBUG(
|
||||
"MedianGradCpuKernel[%s], input_y_grad: size[%llu],"
|
||||
"input_x: size[%llu], input_y: size[%llu],"
|
||||
"input_indices: size[%llu], output_x_grad: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_y_grad->GetDataSize(), input_x->GetDataSize(), input_y->GetDataSize(),
|
||||
input_indices->GetDataSize(), output_x_grad->GetDataSize());
|
||||
} else {
|
||||
KERNEL_LOG_DEBUG(
|
||||
"MedianGradCpuKernel[%s], input_y_grad: size[%llu],"
|
||||
"input_x: size[%llu], input_y: size[%llu],"
|
||||
"output_x_grad: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_y_grad->GetDataSize(), input_x->GetDataSize(), input_y->GetDataSize(),
|
||||
output_x_grad->GetDataSize());
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}

template <typename T1, typename T2>
uint32_t MedianGradCpuKernel::GlobalMedianGradCompute(const CpuKernelContext &ctx) {
  auto y_grad = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
  auto x = reinterpret_cast<T1 *>(ctx.Input(1)->GetData());
  auto y = reinterpret_cast<T1 *>(ctx.Input(2)->GetData());
  auto x_grad = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
  int64_t output_data_num = ctx.Output(0)->NumElements();
  int64_t input_data_num = ctx.Input(1)->NumElements();

  // Count how many elements of x equal the global median y, so the incoming
  // gradient can be split evenly among them.
  T2 count_repeat = 0;
  for (int64_t i = 0; i < input_data_num; i++) {
    count_repeat += (*(x + i) == *y) ? 1 : 0;
  }

  if (output_data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);

    if (output_data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > output_data_num) {
      max_core_num = output_data_num;
    }

    auto sharder_mediangrad = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        *(x_grad + i) = (*(x + i) == *y) ? (*y_grad / count_repeat) : 0;
      }
    };
    KERNEL_HANDLE_ERROR(
      CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, sharder_mediangrad),
      "MedianGrad Compute failed.");
  } else {
    for (int64_t i = 0; i < output_data_num; i++) {
      *(x_grad + i) = (*(x + i) == *y) ? (*y_grad / count_repeat) : 0;
    }
  }
  return KERNEL_STATUS_OK;
}
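// Worked example (illustrative values): for x = {1, 2, 2, 3} the global median y is 2
// and count_repeat is 2, so an incoming gradient g is split as
// x_grad = {0, g / 2, g / 2, 0}; elements that differ from the median receive zero.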

template <typename T1, typename T2>
uint32_t MedianGradCpuKernel::MedianGradCompute(const CpuKernelContext &ctx) {
  auto y_grad = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
  auto indices = reinterpret_cast<int64_t *>(ctx.Input(3)->GetData());
  auto x_grad = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
  int64_t output_data_num = ctx.Output(0)->NumElements();
  int64_t need_calculate_num = ctx.Input(0)->NumElements();

  for (int64_t i = 0; i < output_data_num; i++) {
    *(x_grad + i) = 0;
  }

  AttrValue *axis_ptr = ctx.GetAttr("axis");
  int64_t axis = axis_ptr == nullptr ? 0 : axis_ptr->GetInt();

  std::vector<int64_t> shape_x = ctx.Input(1)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> shape_y = ctx.Input(2)->GetTensorShape()->GetDimSizes();

  // shape_keepdim is the shape of y as if it were reduced with keepdim=true:
  // the reduced axis is kept with size 1.
  std::vector<int64_t> shape_keepdim;
  int64_t dim_num_x = ctx.Input(1)->GetTensorShape()->GetDims();
  axis = axis >= 0 ? axis : axis + dim_num_x;
  for (int64_t i = 0; i < dim_num_x; i++) {
    if (i == axis) {
      shape_keepdim.push_back(1);
    } else {
      shape_keepdim.push_back(shape_x[i]);
    }
  }

  // Row-major strides of x and of the keepdim-shaped y, used to translate a flat
  // index in y_grad into the matching position in x_grad.
  std::vector<int64_t> element_num_each_dim_x;
  std::vector<int64_t> element_num_each_dim_y;
  int64_t element_num_y = 1;
  int64_t element_num_x = 1;
  for (int64_t i = shape_keepdim.size() - 1; i >= 0; i--) {
    element_num_each_dim_x.insert(element_num_each_dim_x.begin(), element_num_x);
    element_num_x *= shape_x[i];
    element_num_each_dim_y.insert(element_num_each_dim_y.begin(), element_num_y);
    element_num_y *= shape_keepdim[i];
  }

  if (need_calculate_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);

    if (need_calculate_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }

    if (max_core_num > need_calculate_num) {
      max_core_num = need_calculate_num;
    }

    auto sharder_mediangrad = [&](int64_t start, int64_t end) {
      std::vector<int64_t> dim_vec;
      for (int64_t i = 0; i < dim_num_x; i++) {
        dim_vec.push_back(0);
      }
      for (int64_t nth_element = start; nth_element < end; nth_element++) {
        // Decompose the flat index of y_grad into per-dimension coordinates.
        int64_t elements_remain = nth_element;
        for (int64_t i = 0; i < dim_num_x; i++) {
          dim_vec[i] = elements_remain / element_num_each_dim_y[i];
          elements_remain %= element_num_each_dim_y[i];
        }
        // Along the reduced axis, the recorded median index selects which
        // element of x receives the gradient.
        int64_t update_element_pos = 0;
        for (int64_t i = 0; i < dim_num_x; i++) {
          if (i == axis) {
            update_element_pos += *(indices + nth_element) * element_num_each_dim_x[i];
          } else {
            update_element_pos += dim_vec[i] * element_num_each_dim_x[i];
          }
        }
        *(x_grad + update_element_pos) = *(y_grad + nth_element);
      }
    };
    KERNEL_HANDLE_ERROR(
      CpuKernelUtils::ParallelFor(ctx, need_calculate_num, need_calculate_num / max_core_num, sharder_mediangrad),
      "MedianGrad Compute failed.");
  } else {
    std::vector<int64_t> dim_vec;
    for (int64_t i = 0; i < dim_num_x; i++) {
      dim_vec.push_back(0);
    }
    for (int64_t nth_element = 0; nth_element < need_calculate_num; nth_element++) {
      int64_t elements_remain = nth_element;
      for (int64_t i = 0; i < dim_num_x; i++) {
        dim_vec[i] = elements_remain / element_num_each_dim_y[i];
        elements_remain %= element_num_each_dim_y[i];
      }
      int64_t update_element_pos = 0;
      for (int64_t i = 0; i < dim_num_x; i++) {
        if (i == axis) {
          update_element_pos += *(indices + nth_element) * element_num_each_dim_x[i];
        } else {
          update_element_pos += dim_vec[i] * element_num_each_dim_x[i];
        }
      }
      *(x_grad + update_element_pos) = *(y_grad + nth_element);
    }
  }
  return KERNEL_STATUS_OK;
}
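// Worked example (illustrative values): for shape_x = {2, 3} and axis = 1,
// shape_keepdim is {2, 1}; the strides are element_num_each_dim_x = {3, 1} and
// element_num_each_dim_y = {1, 1}. For nth_element = 1, dim_vec becomes {1, 0},
// and if indices[1] == 2 then update_element_pos = 1 * 3 + 2 * 1 = 5, so
// x_grad[5] receives y_grad[1] (row 1, column 2 of x).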

REGISTER_CPU_KERNEL(kMedianGrad, MedianGradCpuKernel);
} // namespace aicpu
@ -0,0 +1,42 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_MEDIANGRAD_H_
#define AICPU_KERNELS_NORMALIZED_MEDIANGRAD_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class MedianGradCpuKernel : public CpuKernel {
 public:
  MedianGradCpuKernel() = default;
  ~MedianGradCpuKernel() override = default;

 protected:
  uint32_t Compute(const CpuKernelContext &ctx) override;

 private:
  uint32_t MedianGradParamCheck(const CpuKernelContext &ctx);

  template <typename T1, typename T2>
  uint32_t MedianGradCompute(const CpuKernelContext &ctx);

  template <typename T1, typename T2>
  uint32_t GlobalMedianGradCompute(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_MEDIANGRAD_H_
@ -0,0 +1,21 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "eigen_tensor.h"

namespace aicpu {
const Tensor *EigenTensor::GetTensor() const { return tensor_; }
} // namespace aicpu
@ -0,0 +1,170 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_EIGENTENSOR_H
#define AICPU_EIGENTENSOR_H

#include "cpu_tensor.h"
#include "kernel_log.h"
#include "unsupported/Eigen/CXX11/Tensor"

namespace aicpu {
// Helper to define Tensor types given that the scalar is of type T.
template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
struct TTypes {
  // Rank-<NDIMS> tensor of scalar type T.
  typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> Tensor;
  typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> ConstTensor;

  // Unaligned Rank-<NDIMS> tensor of scalar type T.
  typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType> > UnalignedTensor;
  typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType> > UnalignedConstTensor;

  typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>, Eigen::Aligned> Tensor32Bit;

  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
  typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned>
    Scalar;
  typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned>
    ConstScalar;

  // Unaligned Scalar tensor of scalar type T.
  typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> > UnalignedScalar;
  typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> >
    UnalignedConstScalar;

  // Rank-1 tensor (vector) of scalar type T.
  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Flat;
  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> ConstFlat;
  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Vec;
  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> ConstVec;

  // Unaligned Rank-1 tensor (vector) of scalar type T.
  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> > UnalignedFlat;
  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> > UnalignedConstFlat;
  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> > UnalignedVec;
  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> > UnalignedConstVec;

  // Rank-2 tensor (matrix) of scalar type T.
  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> Matrix;
  typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> ConstMatrix;

  // Unaligned Rank-2 tensor (matrix) of scalar type T.
  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType> > UnalignedMatrix;
  typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType> > UnalignedConstMatrix;
};
} // namespace aicpu

namespace aicpu {
class EigenTensor {
 public:
  EigenTensor() = delete;
  EigenTensor(Tensor *tensor, void *data) : tensor_(tensor), tensor_data_(data) {}
  ~EigenTensor() = default;

  /*
   * Get tensor
   * @return succ: tensor, error: nullptr
   */
  const Tensor *GetTensor() const;

  /*
   * Eigen vec
   * @return Eigen vec
   */
  template <typename T>
  typename TTypes<T>::Vec vec() {
    return tensor<T, 1>();
  }

  /*
   * Eigen matrix
   * @return Eigen matrix
   */
  template <typename T>
  typename TTypes<T>::Matrix matrix() {
    return tensor<T, 2>();
  }

  /*
   * Eigen ConstMatrix
   * @return Eigen ConstMatrix
   */
  template <typename T>
  typename TTypes<T>::ConstMatrix matrix() const {
    return tensor<T, 2>();
  }

  /*
   * Eigen tensor
   * @return Eigen tensor
   */
  template <typename T, size_t NDIMS>
  typename TTypes<T, NDIMS>::Tensor tensor() {
    return typename TTypes<T, NDIMS>::Tensor(reinterpret_cast<T *>(tensor_data_), AsEigenDSizes<NDIMS>());
  }

  /*
   * Eigen ConstTensor
   * @return Eigen ConstTensor
   */
  template <typename T, size_t NDIMS>
  typename TTypes<T, NDIMS>::ConstTensor tensor() const {
    return typename TTypes<T, NDIMS>::ConstTensor(reinterpret_cast<const T *>(tensor_data_), AsEigenDSizes<NDIMS>());
  }

  /*
   * Eigen Flat
   * @return Eigen Flat
   */
  template <typename T>
  typename TTypes<T>::Flat flat() {
    return typename TTypes<T>::Flat(reinterpret_cast<T *>(tensor_data_), {tensor_->GetTensorShape()->NumElements()});
  }

  /*
   * Shape as Eigen::DSizes; when the tensor has fewer than NDIMS dims,
   * the rest of the sizes are padded with 1.
   * @return Eigen::DSizes: the dim sizes padded with trailing 1s
   */
  template <int NDIMS, typename IndexType>
  Eigen::DSizes<IndexType, NDIMS> AsEigenDSizesWithPadding() const {
    Eigen::DSizes<IndexType, NDIMS> dsizes;
    for (int d = 0; d < tensor_->GetTensorShape()->GetDims(); d++) {
      dsizes[d] = static_cast<IndexType>(tensor_->GetTensorShape()->GetDimSize(d));
    }
    for (int d = tensor_->GetTensorShape()->GetDims(); d < NDIMS; d++) {
      dsizes[d] = 1;
    }
    return dsizes;
  }

  /*
   * Shape of this tensor as Eigen::DSizes, padded with 1 beyond the real rank
   * @return Eigen::DSizes: the padded dim sizes
   */
  template <int NDIMS, typename IndexType = Eigen::DenseIndex>
  Eigen::DSizes<IndexType, NDIMS> AsEigenDSizes() const {
    return AsEigenDSizesWithPadding<NDIMS, IndexType>();
  }

 private:
  Tensor *tensor_;
  void *tensor_data_;
};
} // namespace aicpu

#endif // AICPU_EIGENTENSOR_H
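For orientation, a minimal usage sketch (the helper name DoubleInPlace and the assumption that the tensor holds float data are illustrative, not taken from this header): EigenTensor pairs an aicpu Tensor with its raw data pointer and hands back Eigen views such as flat<T>(), vec<T>(), matrix<T>() and tensor<T, NDIMS>().

#include "eigen_tensor.h"

// Illustrative helper (assumed float data): doubles every element in place
// through a rank-1 Eigen view of the tensor buffer.
void DoubleInPlace(aicpu::Tensor *t) {
  aicpu::EigenTensor et(t, t->GetData());
  auto view = et.flat<float>();  // rank-1 view over NumElements() entries
  view = view * 2.0f;            // element-wise multiply, written back through the map
}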
@ -48,7 +48,14 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
                                                        kSliceGradOpName,
                                                        kRandomShuffleOpName,
                                                        kRangeOpName};
  static const std::set<std::string> kMigrateAicpuKernelOps = {kACosOpName};
  static const std::set<std::string> kMigrateAicpuKernelOps = {
    mindspore::kACosOpName,
    mindspore::kLogMatrixDeterminantOpName,
    mindspore::kAdaptiveAvgPool2dOpName,
    mindspore::kAdaptiveAvgPool2dGradOpName,
    mindspore::kMedianOpName,
    mindspore::kMedianGradOpName,
  };
  static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
  static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
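// The ops added to kMigrateAicpuKernelOps are the AICPU kernels migrated in this change
// (Median and MedianGrad among them); the pass uses this set to decide which kernel
// library serves them (inferred from the surrounding constants, not shown in full here).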