fix approximate_equal & trunc & ApplyAdagradV2 & ApplyGradientDescent & SparseApplyAdagradV2

truncate_div & truncate_mod & SparseApplyFtrl & population_count cpu performance
This commit is contained in:
panfengfeng 2022-08-20 19:08:49 +08:00
parent 70a3e5b65d
commit 090937241b
9 changed files with 75 additions and 81 deletions

View File

@ -145,19 +145,7 @@ bool ApplyAdagradV2CpuKernelMod::LaunchKernel(const std::vector<kernel::AddressP
var[i] -= lr[batch_index] * gradient[i] * (one / dividend);
}
};
ParallelLaunchAutoSearch(task, length, this, &parallel_search_info_);
// Copy result to output tensor
auto output_var = reinterpret_cast<T *>(outputs[0]->addr);
auto output_accum = reinterpret_cast<T *>(outputs[1]->addr);
auto ret = memcpy_s(output_var, outputs[0]->size, var, inputs[0]->size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
}
ret = memcpy_s(output_accum, outputs[1]->size, accum, inputs[1]->size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
}
ParallelLaunch(task, length, 0, this, pool_);
return true;
}
@ -176,7 +164,9 @@ std::vector<std::pair<KernelAttr, ApplyAdagradV2CpuKernelMod::ApplyAdagradV2Func
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
.AddOutputAttr(kNumberTypeFloat32)
.AddOutInRef(0, 0)
.AddOutInRef(1, 1),
&ApplyAdagradV2CpuKernelMod::LaunchKernel<float>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
@ -184,7 +174,9 @@ std::vector<std::pair<KernelAttr, ApplyAdagradV2CpuKernelMod::ApplyAdagradV2Func
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
.AddOutputAttr(kNumberTypeFloat16)
.AddOutInRef(0, 0)
.AddOutInRef(1, 1),
&ApplyAdagradV2CpuKernelMod::LaunchKernel<float16>},
};

View File

@ -90,7 +90,7 @@ void ApplyGradientDescentCpuKernelMod::LaunchKernel(const std::vector<AddressPtr
output_addr[pos] = var_addr[pos];
}
};
ParallelLaunchAutoSearch(task, input_size_, this, &parallel_search_info_);
ParallelLaunch(task, input_size_, 0, this, pool_);
}
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, ApplyGradientDescent, ApplyGradientDescentCpuKernelMod);

View File

@ -96,7 +96,7 @@ bool ApproximateEqualCpuKernelMod::LaunchKernel(const std::vector<kernel::Addres
output[i] = abs(x[i] - y[i]) < tol ? true : false;
}
};
ParallelLaunchAutoSearch(task, length, this, &parallel_search_info_);
ParallelLaunch(task, length, 0, this, pool_);
return true;
}

View File

@ -105,7 +105,7 @@ bool PopulationCountCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &in
constexpr size_t min_block_size = 1024;
auto block_size = std::max(min_block_size, length / GetActorMgrInnerThreadPool()->GetKernelThreadNum());
auto task = std::bind(PopulationCount<T>, input_0_addr, output_0_addr, std::placeholders::_1, std::placeholders::_2);
ParallelLaunch(task, length, block_size, this);
ParallelLaunch(task, length, block_size, this, pool_);
return true;
}

View File

@ -35,6 +35,7 @@ constexpr char kKernelName[] = "SparseApplyAdagradV2";
using KernelRunFunc = SparseApplyAdagradV2CpuKernelMod::KernelRunFunc;
/*
template <typename T>
void ComputeAdaGrad(MultiThreadComputeParams<T> *input_params, size_t start, size_t end) {
MS_EXCEPTION_IF_NULL(input_params);
@ -64,6 +65,7 @@ void ComputeAdaGrad(MultiThreadComputeParams<T> *input_params, size_t start, siz
}
}
}
*/
} // namespace
template <typename T>
@ -203,31 +205,35 @@ bool SparseApplyAdagradV2CpuKernelMod::LaunchKernel(const std::vector<kernel::Ad
auto *accum = reinterpret_cast<float *>(inputs[1]->addr);
auto *grad = reinterpret_cast<float *>(inputs[2]->addr);
auto *indices = reinterpret_cast<T *>(inputs[3]->addr);
auto *new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto *new_indices = reinterpret_cast<T *>(workspace[1]->addr);
auto *workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto *workspace_indices = reinterpret_cast<T *>(workspace[3]->addr);
SparseGradient<T> unique_sparse_grad({new_grad, new_indices, indices_size_});
SparseGradient<T> workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient<T> input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam<T> param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
MultiThreadComputeParams<T> input_params;
input_params.var_ = var;
input_params.accum_ = accum;
input_params.lr_ = lr_;
input_params.epsilon_ = lr_;
input_params.update_slots_ = update_slots_;
input_params.sparse_grad_ = unique_sparse_grad;
input_params.var_first_dim_size_ = var_first_dim_size_;
input_params.var_outer_dim_size_ = var_outer_dim_size_;
MultiThreadCompute<T>(ComputeAdaGrad<T>, &input_params, unique_sparse_grad.indices_size_);
const auto lr = lr_;
const auto epsilon = lr_;
const auto update_slots = update_slots_;
const auto unique_sparse_grad = input_sparse_grad;
const auto var_first_dim_size = var_first_dim_size_;
const auto var_outer_dim_size = var_outer_dim_size_;
auto task = [this, &var, &accum, lr, epsilon, update_slots, &unique_sparse_grad, var_first_dim_size,
var_outer_dim_size](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
T index = unique_sparse_grad.indices_[i];
if (index < 0 || LongToSize(index) >= var_first_dim_size) {
MS_LOG(EXCEPTION) << "For '" << kKernelName << "', each element in 'indices' must be in range [0, "
<< SizeToLong(var_first_dim_size) << "), but got " << index;
}
size_t start_index = var_outer_dim_size * static_cast<size_t>(index);
size_t end_index = start_index + var_outer_dim_size;
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
if (update_slots) {
accum[j] += summed_grad * summed_grad;
}
auto learning_rate = lr * (1 / std::sqrt(accum[j] + epsilon));
var[j] -= summed_grad * learning_rate;
}
}
};
ParallelLaunch(task, indices_size_, 0);
return true;
}

View File

@ -34,6 +34,7 @@ constexpr size_t kGradIndex = 3;
constexpr size_t kIndicesIndex = 4;
constexpr size_t kSparseApplyFtrlInputsNum = 5;
constexpr size_t kSparseApplyFtrlWorkspaceSize = 4;
constexpr size_t kSizeGap = 16;
constexpr char kKernelName[] = "SparseApplyFtrl";
using KernelRunFunc = SparseApplyFtrlCpuKernelMod::KernelRunFunc;
@ -403,7 +404,9 @@ const std::vector<std::pair<KernelAttr, KernelRunFunc>> &SparseApplyFtrlCpuKerne
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutInRef(0, 0),
.AddOutInRef(0, 0)
.AddOutInRef(1, 1)
.AddOutInRef(2, 2),
&SparseApplyFtrlCpuKernelMod::LaunchKernel<float, int>},
{KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
@ -414,7 +417,9 @@ const std::vector<std::pair<KernelAttr, KernelRunFunc>> &SparseApplyFtrlCpuKerne
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutInRef(0, 0),
.AddOutInRef(0, 0)
.AddOutInRef(1, 1)
.AddOutInRef(2, 2),
&SparseApplyFtrlCpuKernelMod::LaunchKernel<float, int64_t>}};
return func_list;
}
@ -422,15 +427,12 @@ const std::vector<std::pair<KernelAttr, KernelRunFunc>> &SparseApplyFtrlCpuKerne
template <typename T, typename S>
bool SparseApplyFtrlCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) const {
const std::vector<kernel::AddressPtr> &) const {
auto *var = reinterpret_cast<T *>(inputs[kVarIndex]->addr);
auto *accum = reinterpret_cast<T *>(inputs[kAccumIndex]->addr);
auto *linear = reinterpret_cast<T *>(inputs[kLinearIndex]->addr);
auto *grad = reinterpret_cast<T *>(inputs[kGradIndex]->addr);
auto *indices = reinterpret_cast<S *>(inputs[kIndicesIndex]->addr);
auto *var_out = reinterpret_cast<T *>(outputs[kVarIndex]->addr);
auto *accum_out = reinterpret_cast<T *>(outputs[kAccumIndex]->addr);
auto *linear_out = reinterpret_cast<T *>(outputs[kLinearIndex]->addr);
SparseGradient<S> input_sparse_grad({grad, indices, indices_size_});
MultiThreadComputeParams<S> input_params;
@ -444,20 +446,10 @@ bool SparseApplyFtrlCpuKernelMod::LaunchKernel(const std::vector<kernel::Address
input_params.sparse_grad_ = input_sparse_grad;
input_params.var_first_dim_size_ = var_first_dim_size_;
input_params.var_outer_dim_size_ = var_outer_dim_size_;
MultiThreadCompute<S>(ComputeFtrl<S>, &input_params, indices_size_);
// assign results back to outputs.
auto ret = memcpy_s(var_out, outputs[kVarIndex]->size, var, inputs[kVarIndex]->size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
}
ret = memcpy_s(accum_out, outputs[kAccumIndex]->size, accum, inputs[kAccumIndex]->size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
}
ret = memcpy_s(linear_out, outputs[kLinearIndex]->size, linear, inputs[kLinearIndex]->size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
if (indices_size_ < kSizeGap) {
ComputeFtrl(&input_params, 0, indices_size_);
} else {
MultiThreadCompute<S>(ComputeFtrl<S>, &input_params, indices_size_);
}
return true;
}

View File

@ -24,18 +24,16 @@ namespace {
const size_t kZero = 0;
constexpr size_t kTruncInputsNum = 1;
constexpr size_t kTruncOutputsNum = 1;
constexpr size_t kSizeGapMin = 1024;
constexpr size_t kSizeGapMax = 102400;
template <typename T>
void Trunc(const T *in0, T *out0, size_t start, size_t end) {
for (size_t index = start; index < end; index++) {
int ind = static_cast<int>(in0[index]);
if (std::is_same_v<T, std::uint8_t>) {
if constexpr ((std::is_same_v<T, uint8_t>) || (std::is_same_v<T, int8_t>) || (std::is_same_v<T, int32_t>)) {
out0[index] = in0[index];
} else {
auto absvalue1 = (in0[index]) * (in0[index]);
auto absvalue = sqrt(absvalue1);
auto retp = floor(absvalue);
out0[index] = (ind < 0) ? -retp : retp;
} else if constexpr ((std::is_same_v<T, float>) || (std::is_same_v<T, double>)) {
out0[index] = std::trunc(in0[index]);
}
}
}
@ -68,9 +66,7 @@ bool TruncCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs,
bool ret = true;
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kTruncInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kTruncOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat16) {
ret = LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
if (dtype_ == kNumberTypeFloat32) {
ret = LaunchKernel<float>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat64) {
ret = LaunchKernel<double>(inputs, outputs);
@ -92,13 +88,18 @@ bool TruncCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs, cons
const T *input_0_addr = reinterpret_cast<T *>(inputs[kZero]->addr);
T *output_0_addr = reinterpret_cast<T *>(outputs[kZero]->addr);
auto task = std::bind(Trunc<T>, input_0_addr, output_0_addr, std::placeholders::_1, std::placeholders::_2);
ParallelLaunchAutoSearch(task, input_size_ * kTruncInputsNum, this, &parallel_search_info_);
if (input_size_ <= kSizeGapMin) {
Trunc(input_0_addr, output_0_addr, 0, input_size_ * kTruncInputsNum);
} else if (input_size_ <= kSizeGapMax) {
ParallelLaunchAutoSearch(task, input_size_ * kTruncInputsNum, this, &parallel_search_info_);
} else {
ParallelLaunch(task, input_size_ * kTruncInputsNum, 0, this);
}
return true;
}
std::vector<KernelAttr> TruncCpuKernelMod::GetOpSupport() {
static std::vector<KernelAttr> support_list = {
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),

View File

@ -1869,8 +1869,8 @@ def grid_sample(input_x, grid, interpolation_mode='bilinear', padding_mode='zero
Examples:
>>> input_x = Tensor(np.arange(16).reshape((2, 2, 2, 2)).astype(np.float32))
>>> grid = Tensor(np.arange(0.2, 1, 0.1).reshape((2, 2, 1, 2)).astype(np.float32))
>>> output = F.grid_sample(input_x, grid, interpolation_mode='bilinear', padding_mode='zeros',
... align_corners=True)
>>> output = ops.grid_sample(input_x, grid, interpolation_mode='bilinear', padding_mode='zeros',
... align_corners=True)
>>> print(output)
[[[[ 1.9 ]
[ 2.1999998]]
@ -1993,8 +1993,8 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
>>> inputs = Tensor(np.array([[[0.6, 0.4, 0.2], [0.8, 0.6, 0.3]],
... [[0.0, 0.6, 0.0], [0.5, 0.4, 0.5]]]), mindspore.float32)
>>> sequence_length = Tensor(np.array([2, 2]), mindspore.int32)
>>> decoded_indices, decoded_values, decoded_shape, log_probability = F.ctc_greedy_decoder(inputs,
... sequence_length)
>>> decoded_indices, decoded_values, decoded_shape, log_probability = ops.ctc_greedy_decoder(inputs,
... sequence_length)
>>> print(decoded_indices)
[[0 0]
[0 1]

View File

@ -2001,16 +2001,19 @@ class SparseCountSparseOutput(Primitive):
RuntimeError: If `indices` are not within the bounds of the dense shape.
Examples:
>>> from mindspore.ops.operations.sparse_ops import SparseCountSparseOutput
>>> indices = Tensor([[1, 2] ,[2, 3], [2, 1], [0, 2]], dtype=mstype.int64)
>>> values = Tensor([0, 2, 8, 8], dtype=mstype.int64)
>>> dense_shape = Tensor([4, 4], dtype=mstype.int64)
>>> weights = Tensor([1, 2, 1, 0], dtype=mstype.int64)
>>> sparse_count_sparse_output = ops.SparseCountSparseOutput()
>>> sparse_count_sparse_output = SparseCountSparseOutput()
>>> out = sparse_count_sparse_output(indices, values, dense_shape, weights)
>>> print(out)
(Tensor(shape=[4, 2], dtype=Int64, value= [[0, 8], [1, 0], [2, 2], [2, 8]]),
Tensor(shape=[4], dtype=Int64, value= [0, 1, 2, 1]),
Tensor(shape=[2], dtype=Int64, value= [4, 9])
(Tensor(shape=[4, 2], dtype=Int64, value=
[[0, 8],
[1, 0],
[2, 2],
[2, 8]]), Tensor(shape=[4], dtype=Int64, value= [0, 1, 2, 1]), Tensor(shape=[2], dtype=Int64, value= [4, 9]))
Supported Platforms:
``CPU``