diff --git a/docs/api/api_python/mindspore.nn.rst b/docs/api/api_python/mindspore.nn.rst
index 6057d7875aa..0e70a952dfd 100644
--- a/docs/api/api_python/mindspore.nn.rst
+++ b/docs/api/api_python/mindspore.nn.rst
@@ -234,6 +234,7 @@ Dropout Layer
     mindspore.nn.BCEWithLogitsLoss
     mindspore.nn.CosineEmbeddingLoss
     mindspore.nn.CrossEntropyLoss
+    mindspore.nn.CTCLoss
     mindspore.nn.DiceLoss
     mindspore.nn.FocalLoss
     mindspore.nn.GaussianNLLLoss
diff --git a/docs/api/api_python/nn/mindspore.nn.CTCLoss.rst b/docs/api/api_python/nn/mindspore.nn.CTCLoss.rst
new file mode 100644
index 00000000000..704d78efd0e
--- /dev/null
+++ b/docs/api/api_python/nn/mindspore.nn.CTCLoss.rst
@@ -0,0 +1,32 @@
+mindspore.nn.CTCLoss
+====================
+
+.. py:class:: mindspore.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=False)
+
+    The CTCLoss loss function.
+
+    For a detailed description of the CTC algorithm, see `Connectionist Temporal Classification: Labeling Unsegmented Sequence Data with Recurrent Neural Networks <http://www.cs.toronto.edu/~graves/icml_2006.pdf>`_ .
+
+    Parameters:
+        - **blank** (int) - The blank label. Default: 0.
+        - **reduction** (str) - The reduction applied to the output. One of "none", "mean" or "sum". Default: "mean".
+        - **zero_infinity** (bool) - Whether to set infinite losses and the associated gradients to zero. Default: False.
+
+    Inputs:
+        - **log_probs** (Tensor) - Input tensor of shape :math:`(T, N, C)`, where T is the input length, N is the batch size and C is the number of classes (including blank).
+        - **targets** (Tensor) - Target tensor of shape :math:`(N, S)`, where S is the maximum target length.
+        - **input_lengths** (Union(Tuple, Tensor)) - Tensor or tuple of shape N. The lengths of the inputs.
+        - **target_lengths** (Union(Tuple, Tensor)) - Tensor or tuple of shape N. The lengths of the targets.
+
+    Outputs:
+        - **neg_log_likelihood** (Tensor) - A loss value which is differentiable with respect to each input node.
+
+    Raises:
+        - **TypeError** - `zero_infinity` is not a bool or `reduction` is not a str.
+        - **TypeError** - The dtype of `log_probs` is not float or double.
+        - **TypeError** - The dtype of `targets`, `input_lengths` or `target_lengths` is not int32 or int64.
+        - **ValueError** - `reduction` is not "none", "mean" or "sum".
+        - **ValueError** - The dtypes of `targets`, `input_lengths` and `target_lengths` are different.
+        - **ValueError** - The value of `blank` is not in the range [0, C).
+        - **ValueError** - A value of `input_lengths` is larger than T.
+        - **ValueError** - A `target_lengths[i]` is not in the range [0, `input_lengths[i]`].
diff --git a/docs/api/api_python_en/mindspore.nn.rst b/docs/api/api_python_en/mindspore.nn.rst
index b8898a4b635..e3f4c922cbf 100644
--- a/docs/api/api_python_en/mindspore.nn.rst
+++ b/docs/api/api_python_en/mindspore.nn.rst
@@ -234,6 +234,7 @@ Loss Function
     mindspore.nn.BCEWithLogitsLoss
     mindspore.nn.CosineEmbeddingLoss
     mindspore.nn.CrossEntropyLoss
+    mindspore.nn.CTCLoss
    mindspore.nn.DiceLoss
     mindspore.nn.FocalLoss
     mindspore.nn.GaussianNLLLoss
diff --git a/mindspore/python/mindspore/nn/loss/__init__.py b/mindspore/python/mindspore/nn/loss/__init__.py
index 05a42150290..f04161ddd9e 100644
--- a/mindspore/python/mindspore/nn/loss/__init__.py
+++ b/mindspore/python/mindspore/nn/loss/__init__.py
@@ -20,12 +20,13 @@ It shows how well the model works on a dataset and the optimization target which
 """
 from __future__ import absolute_import
 
-from mindspore.nn.loss.loss import LossBase, L1Loss, MSELoss, SmoothL1Loss, SoftMarginLoss, FocalLoss, \
+from mindspore.nn.loss.loss import LossBase, L1Loss, CTCLoss, MSELoss, SmoothL1Loss, SoftMarginLoss, FocalLoss, \
     SoftmaxCrossEntropyWithLogits, BCELoss, MultiMarginLoss, CosineEmbeddingLoss, \
     SampledSoftmaxLoss, DiceLoss, BCEWithLogitsLoss, MultiClassDiceLoss, MultilabelMarginLoss, \
     RMSELoss, MAELoss, HuberLoss, CrossEntropyLoss, NLLLoss, KLDivLoss, MarginRankingLoss, GaussianNLLLoss
 
-__all__ = ['LossBase', 'L1Loss', 'MSELoss', 'SmoothL1Loss', 'SoftMarginLoss', 'FocalLoss',
+
+__all__ = ['LossBase', 'L1Loss', 'CTCLoss', 'MSELoss', 'SmoothL1Loss', 'SoftMarginLoss',
+           'FocalLoss', 'SoftmaxCrossEntropyWithLogits', 'BCELoss', 'BCEWithLogitsLoss',
+           'MultiMarginLoss', 'CosineEmbeddingLoss', 'SampledSoftmaxLoss', 'DiceLoss',
+           'MultiClassDiceLoss', 'MultilabelMarginLoss', 'RMSELoss', 'MAELoss', 'HuberLoss',
+           'CrossEntropyLoss', 'NLLLoss', 'KLDivLoss', 'MarginRankingLoss', 'GaussianNLLLoss']
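With the re-export above, the new loss is importable straight from the public `mindspore.nn` namespace. A minimal usage sketch (illustrative only, not part of the patch; the shapes follow the RST doc above and the data is random placeholder values):

```python
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import dtype as mstype

T, N, C, S = 5, 2, 3, 2  # input length, batch size, classes (incl. blank), target length

# log_probs is (T, N, C); targets is (N, S) with labels in [1, C) so blank=0 is unused.
log_probs = Tensor(np.random.randn(T, N, C), mstype.float32)
targets = Tensor(np.random.randint(1, C, size=(N, S)), mstype.int32)
input_lengths = Tensor(np.full((N,), T), mstype.int32)
target_lengths = Tensor(np.full((N,), S), mstype.int32)

ctc_loss = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=False)
loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)  # scalar with 'mean'
```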
diff --git a/mindspore/python/mindspore/nn/loss/loss.py b/mindspore/python/mindspore/nn/loss/loss.py
index 071b515b2fc..0ec70f343b7 100644
--- a/mindspore/python/mindspore/nn/loss/loss.py
+++ b/mindspore/python/mindspore/nn/loss/loss.py
@@ -25,6 +25,7 @@ from mindspore.ops import operations as P
 from mindspore.ops.operations.nn_ops import MultiMarginLoss as MultiMarginLossOp
 from mindspore.ops.operations.nn_ops import MultilabelMarginLoss as MultilabelMarginLossOp
 from mindspore.ops.operations.nn_ops import TripletMarginLoss as TripletMarginLossOp
+from mindspore.ops.operations.nn_ops import CTCLossV2
 from mindspore.ops import functional as F
 from mindspore import nn
 from mindspore.ops.primitive import constexpr
@@ -2242,6 +2243,99 @@ class KLDivLoss(LossBase):
         return F.kl_div(logits, labels, self.reduction)
 
 
+class CTCLoss(LossBase):
+    """
+    Calculates the CTC (Connectionist Temporal Classification) loss.
+
+    For the CTC algorithm, refer to `Connectionist Temporal Classification: Labeling Unsegmented Sequence Data with
+    Recurrent Neural Networks <http://www.cs.toronto.edu/~graves/icml_2006.pdf>`_ .
+
+    Args:
+        blank (int): The blank label. Default: 0.
+        reduction (str): Apply specific reduction method to the output: 'none', 'mean', or 'sum'. Default: 'mean'.
+        zero_infinity (bool): Whether to set infinite losses and the associated gradients to zero. Default: False.
+
+    Inputs:
+        - **log_probs** (Tensor) - A tensor of shape (T, N, C) or (T, C), where T is the input length, N is the
+          batch size and C is the number of classes (including blank).
+        - **targets** (Tensor) - A tensor of shape (N, S) or (sum( `target_lengths` )), where S is the max target
+          length. It means the target sequences.
+        - **input_lengths** (Union(Tuple, Tensor)) - A tuple or Tensor of shape (N), or an int (only accepted
+          together with 2-D `log_probs`). It means the lengths of the input.
+        - **target_lengths** (Union(Tuple, Tensor)) - A tuple or Tensor of shape (N), or an int (only accepted
+          together with 2-D `log_probs`). It means the lengths of the target.
+
+    Outputs:
+        - **neg_log_likelihood** (Tensor) - A loss value which is differentiable with respect to each input node.
+
+    Raises:
+        TypeError: If `zero_infinity` is not a bool or `reduction` is not a str.
+        TypeError: If the dtype of `log_probs` is not float or double.
+        TypeError: If the dtype of `targets`, `input_lengths` or `target_lengths` is not int32 or int64.
+        ValueError: If `reduction` is not "none", "mean" or "sum".
+        ValueError: If the dtypes of `targets`, `input_lengths` and `target_lengths` are different.
+        ValueError: If the value of `blank` is not in range [0, C).
+        ValueError: If any value of `input_lengths` is larger than T.
+        ValueError: If any `target_lengths[i]` is not in range [0, `input_lengths[i]`].
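A usage note alongside the implementation: the docstring examples below feed raw `arange` values into `log_probs`, which is why the printed losses come out negative. In real use `log_probs` is expected to hold log-probabilities, typically the output of a log-softmax over the class axis, and the loss is then differentiable as the Outputs section says. A hedged sketch of both points (it assumes `mindspore.ops.log_softmax` and `mindspore.grad` are available in the MindSpore version this patch targets):

```python
import numpy as np
import mindspore as ms
import mindspore.nn as nn
from mindspore import ops, Tensor
from mindspore import dtype as mstype

T, N, C, S = 50, 16, 20, 10  # input length, batch size, classes, target length

logits = Tensor(np.random.randn(T, N, C), mstype.float32)
log_probs = ops.log_softmax(logits, axis=-1)  # normalize over the class axis
targets = Tensor(np.random.randint(1, C, size=(N, S)), mstype.int32)
input_lengths = Tensor(np.full((N,), T), mstype.int32)
target_lengths = Tensor(np.full((N,), S), mstype.int32)

ctc_loss = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=False)

# neg_log_likelihood is differentiable w.r.t. log_probs; grad_position=0
# selects the first positional input of the cell.
grad_fn = ms.grad(ctc_loss, grad_position=0)
dloss_dlogp = grad_fn(log_probs, targets, input_lengths, target_lengths)
print(dloss_dlogp.shape)  # (50, 16, 20)
```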
+
+    Supported Platforms:
+        ``Ascend`` ``CPU``
+
+    Examples:
+        >>> import numpy as np
+        >>> from mindspore import Tensor
+        >>> from mindspore import dtype as mstype
+        >>> from mindspore.nn.loss import CTCLoss
+        >>> T = 5      # Input sequence length
+        >>> C = 2      # Number of classes
+        >>> N = 2      # Batch size
+        >>> S = 3      # Target sequence length of the longest target in the batch (padding length)
+        >>> S_min = 2  # Minimum target length, for demonstration purposes
+        >>> arr = np.arange(T*N*C).reshape((T, N, C))
+        >>> ms_input = Tensor(arr, dtype=mstype.float32)
+        >>> input_lengths = np.full(shape=(N,), fill_value=T)
+        >>> input_lengths = Tensor(input_lengths, dtype=mstype.int32)
+        >>> target_lengths = np.full(shape=(N,), fill_value=S_min)
+        >>> target_lengths = Tensor(target_lengths, dtype=mstype.int32)
+        >>> target = np.random.randint(1, C, size=(N, S))
+        >>> target = Tensor(target, dtype=mstype.int32)
+        >>> ctc_loss = CTCLoss(blank=0, reduction='none', zero_infinity=False)
+        >>> loss = ctc_loss(ms_input, target, input_lengths, target_lengths)
+        >>> print(loss)
+        Tensor(shape=[2], dtype=Float32, value= [-4.57949715e+001, -5.57949677e+001])
+        >>> arr = np.arange(T*C).reshape((T, C))
+        >>> ms_input = Tensor(arr, dtype=mstype.float32)
+        >>> input_lengths = T
+        >>> target_lengths = S_min
+        >>> target = np.random.randint(1, C, size=(S_min,))
+        >>> target = Tensor(target, dtype=mstype.int32)
+        >>> ctc_loss = CTCLoss(blank=0, reduction='none', zero_infinity=False)
+        >>> loss = ctc_loss(ms_input, target, input_lengths, target_lengths)
+        >>> print(loss)
+        Tensor(shape=[1], dtype=Float32, value= [-2.57949677e+001])
+    """
+
+    def __init__(self, blank=0, reduction='mean', zero_infinity=False):
+        super().__init__(reduction)
+        # Compute per-sample losses in the op; the reduction is applied by LossBase.get_loss.
+        self.ctcloss = CTCLossV2(blank=blank, reduction='none', zero_infinity=zero_infinity)
+
+    def construct(self, log_probs, targets, input_lengths, target_lengths):
+        if len(log_probs.shape) == 2:
+            # Unbatched (T, C) input: add a batch axis and expect int lengths.
+            t, c = log_probs.shape
+            log_probs = log_probs.reshape((t, 1, c))
+            targets = targets.reshape(1, targets.shape[0])
+            if isinstance(input_lengths, int):
+                input_lengths = Tensor([input_lengths], mstype.int32)
+            else:
+                raise ValueError("For 2-D log_probs, input_lengths must be an int.")
+            if isinstance(target_lengths, int):
+                target_lengths = Tensor([target_lengths], mstype.int32)
+            else:
+                raise ValueError("For 2-D log_probs, target_lengths must be an int.")
+        neg_log_likelihood, _ = self.ctcloss(log_probs, targets, input_lengths, target_lengths)
+        return self.get_loss(neg_log_likelihood)
+
+
 class GaussianNLLLoss(LossBase):
     r"""Gaussian negative log likelihood loss.
diff --git a/tests/st/nn/test_ctc_loss.py b/tests/st/nn/test_ctc_loss.py
new file mode 100644
index 00000000000..4ad8e77e9f1
--- /dev/null
+++ b/tests/st/nn/test_ctc_loss.py
@@ -0,0 +1,106 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import numpy as np
+import pytest
+
+import mindspore as ms
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore import dtype as mstype
+
+
+class CTCLossNet(nn.Cell):
+    def __init__(self, reduction="none"):
+        super(CTCLossNet, self).__init__()
+        self.ctcloss = nn.CTCLoss(blank=0, reduction=reduction, zero_infinity=False)
+
+    def construct(self, log_probs, target, input_length, target_length):
+        return self.ctcloss(log_probs, target, input_length, target_length)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.platform_arm_cpu
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE])
+@pytest.mark.parametrize('reduct', ["none", "mean", "sum"])
+def test_ctc_loss_tnc(mode, reduct):
+    """
+    Feature: Test the CTCLoss op with input shape (T, N, C).
+    Description: Verify the result of CTCLoss.
+    Expectation: Correct forward result.
+    """
+    ms.set_context(mode=mode)
+    loss = CTCLossNet(reduction=reduct)
+
+    t = 10     # Input sequence length
+    c = 4      # Number of classes
+    n = 2      # Batch size
+    s = 5      # Target sequence length of the longest target in the batch (padding length)
+    s_min = 3  # Minimum target length, for demonstration purposes
+    arr = np.arange(t * n * c).reshape((t, n, c))
+    inputs = Tensor(arr, dtype=mstype.float32)
+    input_lengths = np.full(shape=(n,), fill_value=t)
+    input_lengths = Tensor(input_lengths, dtype=mstype.int32)
+    target_lengths = np.full(shape=(n,), fill_value=s_min)
+    target_lengths = Tensor(target_lengths, dtype=mstype.int32)
+    arr = np.arange(n * s).reshape((n, s))
+    targets = Tensor(arr, dtype=mstype.int32)
+
+    output = loss(inputs, targets, input_lengths, target_lengths)
+
+    if reduct == "none":
+        expect_output = np.array([-3.78184143e+002, -4.60606476e+002])
+    elif reduct == "mean":
+        expect_output = np.array([-419.395])
+    else:
+        expect_output = np.array([-838.791])
+    assert np.allclose(output.asnumpy(), expect_output)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.platform_arm_cpu
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE])
+@pytest.mark.parametrize('reduct', ["none", "mean", "sum"])
+def test_ctc_loss_tc(mode, reduct):
+    """
+    Feature: Test the CTCLoss op with input shape (T, C).
+    Description: Verify the result of CTCLoss.
+    Expectation: Correct forward result.
+    """
+    ms.set_context(mode=mode)
+    loss = CTCLossNet(reduction=reduct)
+
+    t = 10     # Input sequence length
+    c = 4      # Number of classes
+    s_min = 3  # Minimum target length, for demonstration purposes
+    arr = np.arange(t * c).reshape((t, c))
+    inputs = Tensor(arr, dtype=mstype.float32)
+    input_lengths = t
+    target_lengths = s_min
+    arr = np.arange(s_min).reshape((s_min,))
+    targets = Tensor(arr, dtype=mstype.int32)
+
+    output = loss(inputs, targets, input_lengths, target_lengths)
+
+    # With a single sample, all three reductions yield the same value.
+    expect_output = np.array([-1.98184158e+002])
+    assert np.allclose(output.asnumpy(), expect_output)
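The three `reduct` branches in `test_ctc_loss_tnc` are mutually consistent, since `LossBase.get_loss` reduces the per-sample losses with a plain mean or sum. A quick arithmetic check of the expected constants (pure NumPy, nothing from this patch needed):

```python
import numpy as np

per_sample = np.array([-3.78184143e+002, -4.60606476e+002])  # the "none" expectation
print(per_sample.mean())  # ~ -419.395, the "mean" expectation
print(per_sample.sum())   # ~ -838.791, the "sum" expectation
```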
diff --git a/tests/ut/python/nn/test_loss.py b/tests/ut/python/nn/test_loss.py
index 7afda1d4efe..339a75fbdac 100644
--- a/tests/ut/python/nn/test_loss.py
+++ b/tests/ut/python/nn/test_loss.py
@@ -308,6 +308,29 @@ def test_margin_ranking_loss():
     loss(input1, input2, target)
 
 
+def test_ctc_loss():
+    """
+    Feature: Test CTCLoss.
+    Description: Test CTCLoss with normal input.
+    Expectation: Success.
+    """
+    t = 10     # Input sequence length
+    c = 4      # Number of classes
+    n = 2      # Batch size
+    s = 5      # Target sequence length of the longest target in the batch
+    s_min = 3  # Minimum target length, for demonstration purposes
+    arr = np.random.randn(t * n * c).reshape((t, n, c))
+    inputs = Tensor(arr, dtype=mstype.float32)
+    input_lengths = np.full(shape=(n,), fill_value=t)
+    input_lengths = Tensor(input_lengths, dtype=mstype.int32)
+    target_lengths = np.full(shape=(n,), fill_value=s_min)
+    target_lengths = Tensor(target_lengths, dtype=mstype.int32)
+    target = np.random.randint(1, c, size=(n, s))
+    target = Tensor(target, dtype=mstype.int32)
+    ctc_loss = nn.CTCLoss(blank=0, reduction='none', zero_infinity=False)
+    ctc_loss(inputs, target, input_lengths, target_lengths)
+
+
 def test_gaussian_nll_loss():
     """
     Feature: Test GaussianNLLLoss.