diff --git a/mindspore/nn/probability/bijector/exp.py b/mindspore/nn/probability/bijector/exp.py
index 0af3dc7df95..72c0cb509ea 100644
--- a/mindspore/nn/probability/bijector/exp.py
+++ b/mindspore/nn/probability/bijector/exp.py
@@ -35,7 +35,7 @@ class Exp(PowerTransform):
         >>> import mindspore.nn as nn
         >>> from mindspore import Tensor
         >>> import mindspore.context as context
-        >>> context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+        >>> context.set_context(mode=context.GRAPH_MODE)
         >>>
         >>> # To initialize an Exp bijector.
         >>> exp_bijector = nn.probability.bijector.Exp()
diff --git a/mindspore/nn/probability/bijector/power_transform.py b/mindspore/nn/probability/bijector/power_transform.py
index b9c65fc48e0..4e070fef819 100644
--- a/mindspore/nn/probability/bijector/power_transform.py
+++ b/mindspore/nn/probability/bijector/power_transform.py
@@ -81,10 +81,14 @@ class PowerTransform(Bijector):
         self.pow = P.Pow()
         self.dtypeop = P.DType()
         self.cast = P.Cast()
+        self.equal_base = P.Equal()
         self.exp = exp_generic
         self.expm1 = P.Expm1()
+        self.fill = P.Fill()
         self.log = log_generic
         self.log1p = P.Log1p()
+        self.select_base = P.Select()
+        self.shape = P.Shape()
 
     @property
     def power(self):
@@ -99,21 +103,44 @@
     def _forward(self, x):
+        """
+        Evaluate the forward mapping.
+        """
         x = self._check_value_dtype(x)
         power_local = self.cast_param_by_value(x, self.power)
-        if power_local == 0:
-            forward_v = self.exp(x)
-        else:
-            forward_v = self.exp(self.log1p(x * power_local) / power_local)
+
+        # broad cast the value of x and power
+        ones = self.fill(self.dtypeop(power_local), self.shape(x + power_local), 1.)
+        power_local = power_local * ones
+        x = x * ones
+        safe_power = self.select_base(self.equal_base(power_local, 0.),
+                                      ones,
+                                      power_local)
+
+        forward_v = self.select_base(self.equal_base(power_local, 0.),
+                                     self.exp(x),
+                                     self.exp(self.log1p(x * safe_power) / safe_power))
         return forward_v
 
     def _inverse(self, y):
+        """
+        Evaluate the inverse mapping.
+        """
         y = self._check_value_dtype(y)
         power_local = self.cast_param_by_value(y, self.power)
-        if power_local == 0:
-            inverse_v = self.log(y)
-        else:
-            inverse_v = self.expm1(self.log(y) * power_local) / power_local
+
+        # broad cast the value of x and power
+        ones = self.fill(self.dtypeop(power_local), self.shape(y + power_local), 1.)
+        power_local = power_local * ones
+        y = y * ones
+        safe_power = self.select_base(self.equal_base(power_local, 0.),
+                                      ones,
+                                      power_local)
+
+        inverse_v = self.select_base(self.equal_base(power_local, 0.),
+                                     self.log(y),
+                                     self.expm1(self.log(y) * safe_power) / safe_power)
+
         return inverse_v
 
     def _forward_log_jacobian(self, x):
@@ -130,10 +157,16 @@
         """
         x = self._check_value_dtype(x)
         power_local = self.cast_param_by_value(x, self.power)
-        if power_local == 0:
-            forward_log_j = x
-        else:
-            forward_log_j = (1. / power_local - 1) * self.log1p(x * power_local)
+
+        # broad cast the value of x and power
+        ones = self.fill(self.dtypeop(power_local), self.shape(x + power_local), 1.)
+        power_local = power_local * ones
+        x = x * ones
+
+        forward_log_j = self.select_base(self.equal_base(power_local, 0.),
+                                         x,
+                                         (1. / power_local - 1) * self.log1p(x * power_local))
+
         return forward_log_j
 
     def _inverse_log_jacobian(self, y):
diff --git a/mindspore/nn/probability/distribution/beta.py b/mindspore/nn/probability/distribution/beta.py
index 7d1678d4124..5868b40ee91 100644
--- a/mindspore/nn/probability/distribution/beta.py
+++ b/mindspore/nn/probability/distribution/beta.py
@@ -56,7 +56,7 @@ class Beta(Distribution):
         >>> # In this case, `concentration1` and `concentration0` must be passed in through arguments.
         >>> b2 = msd.Beta(dtype=mindspore.float32)
         >>> # Here are some tensors used below for testing
-        >>> value = Tensor([0.1, 0.5, 1.5], dtype=mindspore.float32)
+        >>> value = Tensor([0.1, 0.5, 0.8], dtype=mindspore.float32)
         >>> concentration1_a = Tensor([2.0], dtype=mindspore.float32)
         >>> concentration0_a = Tensor([2.0, 2.0, 2.0], dtype=mindspore.float32)
         >>> concentration1_b = Tensor([1.0], dtype=mindspore.float32)
@@ -72,15 +72,15 @@ class Beta(Distribution):
         >>> # by replacing 'prob' by the name of the function
         >>> ans = b1.prob(value)
         >>> print(ans)
-        [0.43740022 1.8750011 nan]
+        [0.43740022 1.8750011 0.30720013]
         >>> # Evaluate with respect to the distribution b.
         >>> ans = b1.prob(value, concentration1_b, concentration0_b)
         >>> print(ans)
-        [0.99999964 1.0606599 nan]
+        [0.99999964 1.0606599 0.39999983]
         >>> # `concentration1` and `concentration0` must be passed in during function calls
         >>> ans = b2.prob(value, concentration1_a, concentration0_a)
         >>> print(ans)
-        [0.5400001 1.5000001 nan]
+        [0.5400001 1.5000001 0.96000004]
         >>> # Functions `mean`, `sd`, `mode`, `var`, and `entropy` have the same arguments.
         >>> # Args:
         >>> # concentration1 (Tensor): the concentration1 of the distribution. Default: self._concentration1.
diff --git a/mindspore/nn/probability/distribution/cauchy.py b/mindspore/nn/probability/distribution/cauchy.py
index f72fdfc7e9a..cc5536a8dff 100644
--- a/mindspore/nn/probability/distribution/cauchy.py
+++ b/mindspore/nn/probability/distribution/cauchy.py
@@ -168,6 +168,8 @@ class Cauchy(Distribution):
         self.tan = P.Tan()
         self.uniform = C.uniform
 
+        self.entropy_const = np.log(4 * np.pi)
+
     def extend_repr(self):
         if self.is_scalar_batch:
@@ -228,7 +230,7 @@ class Cauchy(Distribution):
             H(X) = \log(4 * \Pi * scale)
         """
         loc, scale = self._check_param_type(loc, scale)
-        return self.log(4 * np.pi * scale)
+        return self.log(scale) + self.entropy_const
 
     def _log_prob(self, value, loc=None, scale=None):
         r"""
diff --git a/mindspore/nn/probability/distribution/gamma.py b/mindspore/nn/probability/distribution/gamma.py
index 2795a5ce952..4741d36e35c 100644
--- a/mindspore/nn/probability/distribution/gamma.py
+++ b/mindspore/nn/probability/distribution/gamma.py
@@ -71,7 +71,7 @@ class Gamma(Distribution):
         >>> # Examples of `prob`.
         >>> # Similar calls can be made to other probability functions
         >>> # by replacing 'prob' by the name of the function
-        >>> # ans = g1.prob(value)
+        >>> ans = g1.prob(value)
         >>> print(ans)
         [0.58610016 0.0429392 0.00176953]
         >>> # Evaluate with respect to the distribution b.
diff --git a/mindspore/nn/probability/distribution/transformed_distribution.py b/mindspore/nn/probability/distribution/transformed_distribution.py
index 5e2f2f70166..d7e4bc3da30 100644
--- a/mindspore/nn/probability/distribution/transformed_distribution.py
+++ b/mindspore/nn/probability/distribution/transformed_distribution.py
@@ -63,8 +63,7 @@ class TransformedDistribution(Distribution):
         >>> # To initialize a transformed distribution
         >>> # using a Normal distribution as the base distribution,
         >>> # and an Exp bijector as the bijector function.
-        >>> trans_dist = msd.TransformedDistribution(msb.Exp(),
-        >>>                                          msd.Normal(0.0, 1.0))
+        >>> trans_dist = msd.TransformedDistribution(msb.Exp(), msd.Normal(0.0, 1.0))
         >>>
         >>> value = Tensor([1.0, 2.0, 3.0], dtype=mindspore.float32)
         >>> prob = trans_dist.prob(value)
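The core `PowerTransform` change replaces the Python-level `if power_local == 0:` branches with broadcast-and-select logic built from `P.Fill`, `P.Equal`, and `P.Select`, so the graph remains valid when `power` is a tensor and the general branch never divides by zero. The following is a minimal NumPy sketch of that pattern, purely for illustration and not the MindSpore API: `np.where` stands in for `P.Select`, and plain broadcasting stands in for the `Fill`-based shape matching.

```python
import numpy as np

def power_transform_forward(x, power):
    """Branchless forward mapping mirroring the select-based logic in the patch."""
    x = np.asarray(x, dtype=np.float32)
    power = np.asarray(power, dtype=np.float32)

    # Broadcast x and power to a common shape (the patch does this with Fill + multiply).
    ones = np.ones(np.broadcast(x, power).shape, dtype=np.float32)
    x, power = x * ones, power * ones

    # Where power == 0, substitute 1 so the general branch never divides by zero,
    # then select between exp(x) and exp(log1p(x * power) / power).
    is_zero = power == 0.0
    safe_power = np.where(is_zero, ones, power)
    return np.where(is_zero, np.exp(x), np.exp(np.log1p(x * safe_power) / safe_power))

print(power_transform_forward([0.0, 1.0, 2.0], 0.0))  # exp(x)
print(power_transform_forward([0.0, 1.0, 2.0], 0.5))  # (1 + 0.5 * x) ** 2
```

As with `P.Select`, both branches are still evaluated; the `safe_power` substitution is what keeps the unused branch finite.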
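The Beta docstring now uses 0.8 instead of 1.5 as the last test value because 1.5 lies outside the Beta support (0, 1), which is why the old example printed `nan`. The new `b2` output (concentration1 = concentration0 = 2.0) can be checked against the standard Beta density; this is a plain-Python check, not a MindSpore call.

```python
from math import gamma

def beta_pdf(x, a, b):
    """Beta(a, b) density: x**(a-1) * (1-x)**(b-1) / B(a, b)."""
    return x ** (a - 1) * (1 - x) ** (b - 1) * gamma(a + b) / (gamma(a) * gamma(b))

print(beta_pdf(0.8, 2.0, 2.0))  # 0.96, matching the last entry of the new b2.prob(value) output
```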
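The Cauchy entropy change is purely an algebraic refactor: H(X) = log(4 * pi * scale) = log(scale) + log(4 * pi), so log(4 * pi) can be computed once in `__init__` as `self.entropy_const` instead of multiplying `scale` by the constant inside the graph. A quick numerical check of the identity:

```python
import numpy as np

scale = np.array([0.5, 1.0, 2.0], dtype=np.float32)
assert np.allclose(np.log(4 * np.pi * scale), np.log(scale) + np.log(4 * np.pi))
```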