forked from mindspore-Ecosystem/mindspore
Bug fixes
parent aeb3aa915a
commit 0af87eb9cc
@@ -3,7 +3,7 @@ mindspore.nn.AdaSumByDeltaWeightWrapCell

.. py:class:: mindspore.nn.AdaSumByDeltaWeightWrapCell(optimizer)

    Implementation of the Adaptive Summation (AdaSum) algorithm, computed from the difference of the parameters before and after the update.
    Implementation of the Adaptive Summation (AdaSum) algorithm, computed from the difference of the parameters before and after the update. Applied in semi_auto_parallel/auto_parallel mode.

    See the paper `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_.

@@ -3,7 +3,7 @@ mindspore.nn.AdaSumByGradWrapCell

.. py:class:: mindspore.nn.AdaSumByGradWrapCell(optimizer)

    Implementation of the Adaptive Summation (AdaSum) algorithm, computed from the gradients.
    Implementation of the Adaptive Summation (AdaSum) algorithm, computed from the gradients. Applied in semi_auto_parallel/auto_parallel mode.

    See the paper `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_.

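For context, a minimal usage sketch of the wrappers documented above, under semi_auto_parallel with 16 cards. MyNet, the loss, and the hyper-parameters are placeholders and not part of this commit; a real multi-card launch also needs the usual communication initialization, which is omitted here.

    from mindspore import context, nn, Model

    class MyNet(nn.Cell):
        # Placeholder network, only to make the sketch self-contained.
        def __init__(self):
            super().__init__()
            self.dense = nn.Dense(32, 10)

        def construct(self, x):
            return self.dense(x)

    # AdaSum requires semi_auto_parallel or auto_parallel mode.
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=16)

    net = MyNet()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
    # Wrap an ordinary optimizer; AdaSumByDeltaWeightWrapCell is used the same way
    # when combining by weight deltas instead of gradients.
    optim = nn.AdaSumByGradWrapCell(nn.Momentum(params=net.trainable_params(),
                                                learning_rate=0.1, momentum=0.9))
    model = Model(net, loss_fn=loss_fn, optimizer=optim)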
@@ -484,6 +484,14 @@ Status MatMulBase::GenerateStrategiesNotPower2(int64_t stage_id, size_t dev_num_
    }
  }
  strategy_cost_.clear();
  // add the repeated strategy
  auto repeated_stra_arrays{inputs_shape_};
  for (auto &stra_array : repeated_stra_arrays) {
    std::fill(stra_array.begin(), stra_array.end(), 1);
  }
  StrategyPtr repeated_stra = std::make_shared<Strategy>(stage_id, repeated_stra_arrays);
  sp_vector.push_back(repeated_stra);

  for (auto &sp : sp_vector) {
    if (SetCostUnderStrategy(sp) == FAILED) {
      MS_LOG(WARNING) << name_ << " : Calculating cost for strategy failed.";
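The added C++ appends a "repeated" candidate strategy: every dimension of every input keeps a split factor of 1, so the operator is not partitioned at all and simply runs replicated on each device. A rough Python illustration of that construction follows; the function and the example shapes are illustrative only, not MindSpore APIs, and the same pattern is added to GenerateStrategiesForIndependentInputs in the next hunk.

    def repeated_strategy(inputs_shape):
        # One all-ones tuple per operator input: no dimension is partitioned,
        # so the operator is simply replicated ("repeated") on every device.
        return tuple(tuple(1 for _ in shape) for shape in inputs_shape)

    # e.g. a MatMul whose inputs have shapes (64, 32) and (32, 16):
    print(repeated_strategy([(64, 32), (32, 16)]))   # ((1, 1), (1, 1))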
@@ -1470,6 +1470,13 @@ Status GenerateStrategiesForIndependentInputs(int64_t stage_id, const Shapes &in
      }
    }
  }
  // add the repeated strategy
  auto repeated_stra_arrays{splittable_inputs};
  for (auto &stra_array : repeated_stra_arrays) {
    std::fill(stra_array.begin(), stra_array.end(), 1);
  }
  StrategyPtr repeated_stra = std::make_shared<Strategy>(stage_id, repeated_stra_arrays);
  sp_vector->push_back(repeated_stra);
  return SUCCESS;
}

@@ -380,6 +380,18 @@ def _parallel_check():
class AdaSumByGradWrapCell(Cell):
    r"""
    Enable AdaSum in "auto_parallel/semi_auto_parallel" mode.
    The implementation of the Adaptive Summation (AdaSum) algorithm is calculated by gradients.
    See the paper `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_.

    .. math::
        \begin{array}{ll}
          w_{t+1}=w_{t} - \alpha \cdot Adasum(g_{1}, g_{2}) \\
          w_{t+1}=w_{t} - \alpha \cdot [(1 - \frac{g_2^{T}\cdot g_1}{2\cdot \left \| g_1 \right \|^2 })\cdot g_1 +
          (1 - \frac{g_1^{T}\cdot g_2}{2\cdot \left \| g_2 \right \|^2 })\cdot g_2] \\
        \end{array}

    In this implementation, :math:`g` represents the gradient of the weights,
    and the subscripts represent different devices in the data-parallel dimension.

    Note:
        When using AdaSum, the number of training cards needs to be a power of 2 and at least 16 cards are required.
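To make the combination rule above concrete, here is a small NumPy sketch (not MindSpore code) of Adasum(g_1, g_2) for two devices. It shows the behavior the formula implies: orthogonal gradients are summed unchanged, while identical gradients are effectively averaged.

    import numpy as np

    def adasum(g1, g2):
        # Adasum(g1, g2) = (1 - g2^T g1 / (2*||g1||^2)) * g1 + (1 - g1^T g2 / (2*||g2||^2)) * g2
        dot = float(np.dot(g1, g2))
        return (1 - dot / (2 * np.dot(g1, g1))) * g1 + (1 - dot / (2 * np.dot(g2, g2))) * g2

    g1, g2 = np.array([1.0, 0.0]), np.array([0.0, 1.0])
    print(adasum(g1, g2))   # orthogonal gradients -> plain sum: [1. 1.]
    print(adasum(g1, g1))   # identical gradients -> g1 itself (the average of g1 + g1): [1. 0.]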
@@ -433,6 +445,19 @@ class AdaSumByGradWrapCell(Cell):
class AdaSumByDeltaWeightWrapCell(Cell):
    r"""
    Enable AdaSum in "auto_parallel/semi_auto_parallel" mode.
    The implementation of the Adaptive Summation (AdaSum) algorithm is calculated based on the difference of weights
    before and after the optimizer update.
    See the paper `AdaSum: Scaling Distributed Training with Adaptive Summation <https://arxiv.org/abs/2006.02924>`_.

    .. math::
        \begin{array}{ll}
          w_{t+1}=w_{t} - \alpha \cdot Adasum(g_{1}, g_{2}) \\
          w_{t+1}=w_{t} - \alpha \cdot [(1 - \frac{g_2^{T}\cdot g_1}{2\cdot \left \| g_1 \right \|^2 })\cdot g_1 +
          (1 - \frac{g_1^{T}\cdot g_2}{2\cdot \left \| g_2 \right \|^2 })\cdot g_2] \\
        \end{array}

    In this implementation, :math:`g` represents the weight difference before and after the optimizer update,
    and the subscripts represent different devices in the data-parallel dimension.

    Note:
        When using AdaSum, the number of training cards needs to be a power of 2 and at least 16 cards are required.
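The Note's constraint (a power-of-two card count, at least 16) can be checked in a few lines. _check_adasum_device_num below is a hypothetical helper written only to illustrate the documented rule; it is not part of this commit or of the MindSpore API.

    def _check_adasum_device_num(device_num: int) -> None:
        # Hypothetical helper mirroring the documented constraint: AdaSum needs a
        # power-of-two number of training cards, and at least 16 of them.
        if device_num < 16 or (device_num & (device_num - 1)) != 0:
            raise ValueError(f"AdaSum requires a power-of-two device count of at least 16, got {device_num}.")

    _check_adasum_device_num(16)   # passes
    _check_adasum_device_num(24)   # raises ValueError: 24 is not a power of two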