forked from mindspore-Ecosystem/mindspore
fix approximate_equal & trunc & ApplyAdagradV2 & ApplyGradientDescent & SparseApplyAdagradV2 &
truncate_div & truncate_mod & SparseApplyFtrl & population_count cpu performance
This commit is contained in:
parent
70a3e5b65d
commit
090937241b
|
@ -145,19 +145,7 @@ bool ApplyAdagradV2CpuKernelMod::LaunchKernel(const std::vector<kernel::AddressP
|
|||
var[i] -= lr[batch_index] * gradient[i] * (one / dividend);
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, length, this, &parallel_search_info_);
|
||||
|
||||
// Copy result to output tensor
|
||||
auto output_var = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
auto output_accum = reinterpret_cast<T *>(outputs[1]->addr);
|
||||
auto ret = memcpy_s(output_var, outputs[0]->size, var, inputs[0]->size);
|
||||
if (ret != EOK) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
|
||||
}
|
||||
ret = memcpy_s(output_accum, outputs[1]->size, accum, inputs[1]->size);
|
||||
if (ret != EOK) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
|
||||
}
|
||||
ParallelLaunch(task, length, 0, this, pool_);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -176,7 +164,9 @@ std::vector<std::pair<KernelAttr, ApplyAdagradV2CpuKernelMod::ApplyAdagradV2Func
|
|||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutInRef(0, 0)
|
||||
.AddOutInRef(1, 1),
|
||||
&ApplyAdagradV2CpuKernelMod::LaunchKernel<float>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
|
@ -184,7 +174,9 @@ std::vector<std::pair<KernelAttr, ApplyAdagradV2CpuKernelMod::ApplyAdagradV2Func
|
|||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutInRef(0, 0)
|
||||
.AddOutInRef(1, 1),
|
||||
&ApplyAdagradV2CpuKernelMod::LaunchKernel<float16>},
|
||||
};
|
||||
|
||||
|
|
|
@ -90,7 +90,7 @@ void ApplyGradientDescentCpuKernelMod::LaunchKernel(const std::vector<AddressPtr
|
|||
output_addr[pos] = var_addr[pos];
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, input_size_, this, &parallel_search_info_);
|
||||
ParallelLaunch(task, input_size_, 0, this, pool_);
|
||||
}
|
||||
|
||||
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, ApplyGradientDescent, ApplyGradientDescentCpuKernelMod);
|
||||
|
|
|
@ -96,7 +96,7 @@ bool ApproximateEqualCpuKernelMod::LaunchKernel(const std::vector<kernel::Addres
|
|||
output[i] = abs(x[i] - y[i]) < tol ? true : false;
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, length, this, &parallel_search_info_);
|
||||
ParallelLaunch(task, length, 0, this, pool_);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -105,7 +105,7 @@ bool PopulationCountCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &in
|
|||
constexpr size_t min_block_size = 1024;
|
||||
auto block_size = std::max(min_block_size, length / GetActorMgrInnerThreadPool()->GetKernelThreadNum());
|
||||
auto task = std::bind(PopulationCount<T>, input_0_addr, output_0_addr, std::placeholders::_1, std::placeholders::_2);
|
||||
ParallelLaunch(task, length, block_size, this);
|
||||
ParallelLaunch(task, length, block_size, this, pool_);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -35,6 +35,7 @@ constexpr char kKernelName[] = "SparseApplyAdagradV2";
|
|||
|
||||
using KernelRunFunc = SparseApplyAdagradV2CpuKernelMod::KernelRunFunc;
|
||||
|
||||
/*
|
||||
template <typename T>
|
||||
void ComputeAdaGrad(MultiThreadComputeParams<T> *input_params, size_t start, size_t end) {
|
||||
MS_EXCEPTION_IF_NULL(input_params);
|
||||
|
@ -64,6 +65,7 @@ void ComputeAdaGrad(MultiThreadComputeParams<T> *input_params, size_t start, siz
|
|||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
|
@ -203,31 +205,35 @@ bool SparseApplyAdagradV2CpuKernelMod::LaunchKernel(const std::vector<kernel::Ad
|
|||
auto *accum = reinterpret_cast<float *>(inputs[1]->addr);
|
||||
auto *grad = reinterpret_cast<float *>(inputs[2]->addr);
|
||||
auto *indices = reinterpret_cast<T *>(inputs[3]->addr);
|
||||
auto *new_grad = reinterpret_cast<float *>(workspace[0]->addr);
|
||||
auto *new_indices = reinterpret_cast<T *>(workspace[1]->addr);
|
||||
auto *workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
|
||||
auto *workspace_indices = reinterpret_cast<T *>(workspace[3]->addr);
|
||||
|
||||
SparseGradient<T> unique_sparse_grad({new_grad, new_indices, indices_size_});
|
||||
SparseGradient<T> workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
|
||||
SparseGradient<T> input_sparse_grad({grad, indices, indices_size_});
|
||||
ReduceSparseGradientParam<T> param;
|
||||
param.input_grad_ = &input_sparse_grad;
|
||||
param.workspace_grad_ = &workspace_sparse_grad;
|
||||
param.output_grad_ = &unique_sparse_grad;
|
||||
param.max_index_ = var_first_dim_size_;
|
||||
param.value_stride_ = var_outer_dim_size_;
|
||||
BucketReduceSparseGradient(param);
|
||||
MultiThreadComputeParams<T> input_params;
|
||||
input_params.var_ = var;
|
||||
input_params.accum_ = accum;
|
||||
input_params.lr_ = lr_;
|
||||
input_params.epsilon_ = lr_;
|
||||
input_params.update_slots_ = update_slots_;
|
||||
input_params.sparse_grad_ = unique_sparse_grad;
|
||||
input_params.var_first_dim_size_ = var_first_dim_size_;
|
||||
input_params.var_outer_dim_size_ = var_outer_dim_size_;
|
||||
MultiThreadCompute<T>(ComputeAdaGrad<T>, &input_params, unique_sparse_grad.indices_size_);
|
||||
const auto lr = lr_;
|
||||
const auto epsilon = lr_;
|
||||
const auto update_slots = update_slots_;
|
||||
const auto unique_sparse_grad = input_sparse_grad;
|
||||
const auto var_first_dim_size = var_first_dim_size_;
|
||||
const auto var_outer_dim_size = var_outer_dim_size_;
|
||||
auto task = [this, &var, &accum, lr, epsilon, update_slots, &unique_sparse_grad, var_first_dim_size,
|
||||
var_outer_dim_size](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
T index = unique_sparse_grad.indices_[i];
|
||||
if (index < 0 || LongToSize(index) >= var_first_dim_size) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kKernelName << "', each element in 'indices' must be in range [0, "
|
||||
<< SizeToLong(var_first_dim_size) << "), but got " << index;
|
||||
}
|
||||
size_t start_index = var_outer_dim_size * static_cast<size_t>(index);
|
||||
size_t end_index = start_index + var_outer_dim_size;
|
||||
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
|
||||
auto summed_grad = unique_sparse_grad.value_[k];
|
||||
if (update_slots) {
|
||||
accum[j] += summed_grad * summed_grad;
|
||||
}
|
||||
auto learning_rate = lr * (1 / std::sqrt(accum[j] + epsilon));
|
||||
var[j] -= summed_grad * learning_rate;
|
||||
}
|
||||
}
|
||||
};
|
||||
ParallelLaunch(task, indices_size_, 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@ constexpr size_t kGradIndex = 3;
|
|||
constexpr size_t kIndicesIndex = 4;
|
||||
constexpr size_t kSparseApplyFtrlInputsNum = 5;
|
||||
constexpr size_t kSparseApplyFtrlWorkspaceSize = 4;
|
||||
constexpr size_t kSizeGap = 16;
|
||||
constexpr char kKernelName[] = "SparseApplyFtrl";
|
||||
|
||||
using KernelRunFunc = SparseApplyFtrlCpuKernelMod::KernelRunFunc;
|
||||
|
@ -403,7 +404,9 @@ const std::vector<std::pair<KernelAttr, KernelRunFunc>> &SparseApplyFtrlCpuKerne
|
|||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutInRef(0, 0),
|
||||
.AddOutInRef(0, 0)
|
||||
.AddOutInRef(1, 1)
|
||||
.AddOutInRef(2, 2),
|
||||
&SparseApplyFtrlCpuKernelMod::LaunchKernel<float, int>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
|
@ -414,7 +417,9 @@ const std::vector<std::pair<KernelAttr, KernelRunFunc>> &SparseApplyFtrlCpuKerne
|
|||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutInRef(0, 0),
|
||||
.AddOutInRef(0, 0)
|
||||
.AddOutInRef(1, 1)
|
||||
.AddOutInRef(2, 2),
|
||||
&SparseApplyFtrlCpuKernelMod::LaunchKernel<float, int64_t>}};
|
||||
return func_list;
|
||||
}
|
||||
|
@ -422,15 +427,12 @@ const std::vector<std::pair<KernelAttr, KernelRunFunc>> &SparseApplyFtrlCpuKerne
|
|||
template <typename T, typename S>
|
||||
bool SparseApplyFtrlCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &workspace,
|
||||
const std::vector<kernel::AddressPtr> &outputs) const {
|
||||
const std::vector<kernel::AddressPtr> &) const {
|
||||
auto *var = reinterpret_cast<T *>(inputs[kVarIndex]->addr);
|
||||
auto *accum = reinterpret_cast<T *>(inputs[kAccumIndex]->addr);
|
||||
auto *linear = reinterpret_cast<T *>(inputs[kLinearIndex]->addr);
|
||||
auto *grad = reinterpret_cast<T *>(inputs[kGradIndex]->addr);
|
||||
auto *indices = reinterpret_cast<S *>(inputs[kIndicesIndex]->addr);
|
||||
auto *var_out = reinterpret_cast<T *>(outputs[kVarIndex]->addr);
|
||||
auto *accum_out = reinterpret_cast<T *>(outputs[kAccumIndex]->addr);
|
||||
auto *linear_out = reinterpret_cast<T *>(outputs[kLinearIndex]->addr);
|
||||
|
||||
SparseGradient<S> input_sparse_grad({grad, indices, indices_size_});
|
||||
MultiThreadComputeParams<S> input_params;
|
||||
|
@ -444,20 +446,10 @@ bool SparseApplyFtrlCpuKernelMod::LaunchKernel(const std::vector<kernel::Address
|
|||
input_params.sparse_grad_ = input_sparse_grad;
|
||||
input_params.var_first_dim_size_ = var_first_dim_size_;
|
||||
input_params.var_outer_dim_size_ = var_outer_dim_size_;
|
||||
MultiThreadCompute<S>(ComputeFtrl<S>, &input_params, indices_size_);
|
||||
|
||||
// assign results back to outputs.
|
||||
auto ret = memcpy_s(var_out, outputs[kVarIndex]->size, var, inputs[kVarIndex]->size);
|
||||
if (ret != EOK) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
|
||||
}
|
||||
ret = memcpy_s(accum_out, outputs[kAccumIndex]->size, accum, inputs[kAccumIndex]->size);
|
||||
if (ret != EOK) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
|
||||
}
|
||||
ret = memcpy_s(linear_out, outputs[kLinearIndex]->size, linear, inputs[kLinearIndex]->size);
|
||||
if (ret != EOK) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', launch kernel error: memcpy failed. Error no: " << ret;
|
||||
if (indices_size_ < kSizeGap) {
|
||||
ComputeFtrl(&input_params, 0, indices_size_);
|
||||
} else {
|
||||
MultiThreadCompute<S>(ComputeFtrl<S>, &input_params, indices_size_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -24,18 +24,16 @@ namespace {
|
|||
const size_t kZero = 0;
|
||||
constexpr size_t kTruncInputsNum = 1;
|
||||
constexpr size_t kTruncOutputsNum = 1;
|
||||
constexpr size_t kSizeGapMin = 1024;
|
||||
constexpr size_t kSizeGapMax = 102400;
|
||||
|
||||
template <typename T>
|
||||
void Trunc(const T *in0, T *out0, size_t start, size_t end) {
|
||||
for (size_t index = start; index < end; index++) {
|
||||
int ind = static_cast<int>(in0[index]);
|
||||
if (std::is_same_v<T, std::uint8_t>) {
|
||||
if constexpr ((std::is_same_v<T, uint8_t>) || (std::is_same_v<T, int8_t>) || (std::is_same_v<T, int32_t>)) {
|
||||
out0[index] = in0[index];
|
||||
} else {
|
||||
auto absvalue1 = (in0[index]) * (in0[index]);
|
||||
auto absvalue = sqrt(absvalue1);
|
||||
auto retp = floor(absvalue);
|
||||
out0[index] = (ind < 0) ? -retp : retp;
|
||||
} else if constexpr ((std::is_same_v<T, float>) || (std::is_same_v<T, double>)) {
|
||||
out0[index] = std::trunc(in0[index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -68,9 +66,7 @@ bool TruncCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
|||
bool ret = true;
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kTruncInputsNum, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kTruncOutputsNum, kernel_name_);
|
||||
if (dtype_ == kNumberTypeFloat16) {
|
||||
ret = LaunchKernel<float16>(inputs, outputs);
|
||||
} else if (dtype_ == kNumberTypeFloat32) {
|
||||
if (dtype_ == kNumberTypeFloat32) {
|
||||
ret = LaunchKernel<float>(inputs, outputs);
|
||||
} else if (dtype_ == kNumberTypeFloat64) {
|
||||
ret = LaunchKernel<double>(inputs, outputs);
|
||||
|
@ -92,13 +88,18 @@ bool TruncCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs, cons
|
|||
const T *input_0_addr = reinterpret_cast<T *>(inputs[kZero]->addr);
|
||||
T *output_0_addr = reinterpret_cast<T *>(outputs[kZero]->addr);
|
||||
auto task = std::bind(Trunc<T>, input_0_addr, output_0_addr, std::placeholders::_1, std::placeholders::_2);
|
||||
ParallelLaunchAutoSearch(task, input_size_ * kTruncInputsNum, this, &parallel_search_info_);
|
||||
if (input_size_ <= kSizeGapMin) {
|
||||
Trunc(input_0_addr, output_0_addr, 0, input_size_ * kTruncInputsNum);
|
||||
} else if (input_size_ <= kSizeGapMax) {
|
||||
ParallelLaunchAutoSearch(task, input_size_ * kTruncInputsNum, this, &parallel_search_info_);
|
||||
} else {
|
||||
ParallelLaunch(task, input_size_ * kTruncInputsNum, 0, this);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<KernelAttr> TruncCpuKernelMod::GetOpSupport() {
|
||||
static std::vector<KernelAttr> support_list = {
|
||||
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
|
||||
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
|
||||
|
|
|
@ -1869,8 +1869,8 @@ def grid_sample(input_x, grid, interpolation_mode='bilinear', padding_mode='zero
|
|||
Examples:
|
||||
>>> input_x = Tensor(np.arange(16).reshape((2, 2, 2, 2)).astype(np.float32))
|
||||
>>> grid = Tensor(np.arange(0.2, 1, 0.1).reshape((2, 2, 1, 2)).astype(np.float32))
|
||||
>>> output = F.grid_sample(input_x, grid, interpolation_mode='bilinear', padding_mode='zeros',
|
||||
... align_corners=True)
|
||||
>>> output = ops.grid_sample(input_x, grid, interpolation_mode='bilinear', padding_mode='zeros',
|
||||
... align_corners=True)
|
||||
>>> print(output)
|
||||
[[[[ 1.9 ]
|
||||
[ 2.1999998]]
|
||||
|
@ -1993,8 +1993,8 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
|
|||
>>> inputs = Tensor(np.array([[[0.6, 0.4, 0.2], [0.8, 0.6, 0.3]],
|
||||
... [[0.0, 0.6, 0.0], [0.5, 0.4, 0.5]]]), mindspore.float32)
|
||||
>>> sequence_length = Tensor(np.array([2, 2]), mindspore.int32)
|
||||
>>> decoded_indices, decoded_values, decoded_shape, log_probability = F.ctc_greedy_decoder(inputs,
|
||||
... sequence_length)
|
||||
>>> decoded_indices, decoded_values, decoded_shape, log_probability = ops.ctc_greedy_decoder(inputs,
|
||||
... sequence_length)
|
||||
>>> print(decoded_indices)
|
||||
[[0 0]
|
||||
[0 1]
|
||||
|
|
|
@ -2001,16 +2001,19 @@ class SparseCountSparseOutput(Primitive):
|
|||
RuntimeError: If indexes are not in bounds of the dense shape.
|
||||
|
||||
Examples:
|
||||
>>> from mindspore.ops.operations.sparse_ops import SparseCountSparseOutput
|
||||
>>> indices = Tensor([[1, 2] ,[2, 3], [2, 1], [0, 2]], dtype=mstype.int64)
|
||||
>>> values = Tensor([0, 2, 8, 8], dtype=mstype.int64)
|
||||
>>> dense_shape = Tensor([4, 4], dtype=mstype.int64)
|
||||
>>> weights = Tensor([1, 2, 1, 0], dtype=mstype.int64)
|
||||
>>> sparse_count_sparse_output = ops.SparseCountSparseOutput()
|
||||
>>> sparse_count_sparse_output = SparseCountSparseOutput()
|
||||
>>> out = sparse_count_sparse_output(indices, values, dense_shape, weights)
|
||||
>>> print(out)
|
||||
(Tensor(shape=[4, 2], dtype=Int64, value= [[0, 8], [1, 0], [2, 2], [2, 8]]),
|
||||
Tensor(shape=[4], dtype=Int64, value= [0, 1, 2, 1]),
|
||||
Tensor(shape=[2], dtype=Int64, value= [4, 9])
|
||||
(Tensor(shape=[4, 2], dtype=Int64, value=
|
||||
[[0, 8],
|
||||
[1, 0],
|
||||
[2, 2],
|
||||
[2, 8]]), Tensor(shape=[4], dtype=Int64, value= [0, 1, 2, 1]), Tensor(shape=[2], dtype=Int64, value= [4, 9]))
|
||||
|
||||
Supported Platforms:
|
||||
``CPU``
|
||||
|
|
Loading…
Reference in New Issue