[feat] [assistant] [I3J6U8] Add a new audio operator DetectPitchFrequency

2021-09-29 15:12:58 +08:00 · 2021-09-29 15:12:58 +08:00 · b7e6520e62
parent 6d1a73b4c1
commit b7e6520e62
17 changed files with 892 additions and 19 deletions
--- a/mindspore/ccsrc/minddata/dataset/api/audio.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/audio.cc
@ -28,6 +28,7 @@
 #include "minddata/dataset/audio/ir/kernels/contrast_ir.h"
 #include "minddata/dataset/audio/ir/kernels/dc_shift_ir.h"
 #include "minddata/dataset/audio/ir/kernels/deemph_biquad_ir.h"
+#include "minddata/dataset/audio/ir/kernels/detect_pitch_frequency_ir.h"
 #include "minddata/dataset/audio/ir/kernels/equalizer_biquad_ir.h"
 #include "minddata/dataset/audio/ir/kernels/fade_ir.h"
 #include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h"
@ -218,6 +219,30 @@ std::shared_ptr<TensorOperation> DeemphBiquad::Parse() {
  return std::make_shared<DeemphBiquadOperation>(data_->sample_rate_);
 }

+// DetectPitchFrequency Transform Operation.
+// Parameter bundle stored by the public DetectPitchFrequency transform until Parse() is called.
+struct DetectPitchFrequency::Data {
+  int32_t sample_rate_;  // sampling rate of the waveform (Hz)
+  float frame_time_;     // duration of a frame
+  int32_t win_length_;   // median smoothing window, in number of frames
+  int32_t freq_low_;     // lowest detectable frequency (Hz)
+  int32_t freq_high_;    // highest detectable frequency (Hz)
+
+  Data(int32_t sample_rate, float frame_time, int32_t win_length, int32_t freq_low, int32_t freq_high)
+      : sample_rate_{sample_rate},
+        frame_time_{frame_time},
+        win_length_{win_length},
+        freq_low_{freq_low},
+        freq_high_{freq_high} {}
+};
+
+// Only stores the arguments in the shared parameter bundle; nothing is validated here.
+DetectPitchFrequency::DetectPitchFrequency(int32_t sample_rate, float frame_time, int32_t win_length, int32_t freq_low,
+                                           int32_t freq_high)
+    : data_{std::make_shared<Data>(sample_rate, frame_time, win_length, freq_low, freq_high)} {}
+
+// Converts the stored parameters into a DetectPitchFrequencyOperation (the IR representation).
+std::shared_ptr<TensorOperation> DetectPitchFrequency::Parse() {
+  const Data &params = *data_;
+  return std::make_shared<DetectPitchFrequencyOperation>(params.sample_rate_, params.frame_time_, params.win_length_,
+                                                         params.freq_low_, params.freq_high_);
+}
+
 // EqualizerBiquad Transform Operation.
 struct EqualizerBiquad::Data {
  Data(int32_t sample_rate, float center_freq, float gain, float Q)
--- a/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/audio/kernels/ir/bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python/bindings/dataset/audio/kernels/ir/bindings.cc
@ -32,6 +32,7 @@
 #include "minddata/dataset/audio/ir/kernels/contrast_ir.h"
 #include "minddata/dataset/audio/ir/kernels/dc_shift_ir.h"
 #include "minddata/dataset/audio/ir/kernels/deemph_biquad_ir.h"
+#include "minddata/dataset/audio/ir/kernels/detect_pitch_frequency_ir.h"
 #include "minddata/dataset/audio/ir/kernels/equalizer_biquad_ir.h"
 #include "minddata/dataset/audio/ir/kernels/fade_ir.h"
 #include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h"
@ -186,6 +187,19 @@ PYBIND_REGISTER(
      }));
  }));

+// Python binding for DetectPitchFrequencyOperation; the parameters are validated when the object is constructed.
+PYBIND_REGISTER(DetectPitchFrequencyOperation, 1, ([](const py::module *m) {
+                  using OperationT = audio::DetectPitchFrequencyOperation;
+                  (void)py::class_<OperationT, TensorOperation, std::shared_ptr<OperationT>>(
+                    *m, "DetectPitchFrequencyOperation")
+                    .def(py::init([](int32_t sample_rate, float frame_time, int32_t win_length, int32_t freq_low,
+                                     int32_t freq_high) {
+                      auto operation =
+                        std::make_shared<OperationT>(sample_rate, frame_time, win_length, freq_low, freq_high);
+                      // Validate eagerly so that a failed Status is reported through THROW_IF_ERROR.
+                      THROW_IF_ERROR(operation->ValidateParams());
+                      return operation;
+                    }));
+                }));
+
 PYBIND_REGISTER(EqualizerBiquadOperation, 1, ([](const py::module *m) {
                  (void)py::class_<audio::EqualizerBiquadOperation, TensorOperation,
                                   std::shared_ptr<audio::EqualizerBiquadOperation>>(*m, "EqualizerBiquadOperation")
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/CMakeLists.txt
@ -14,6 +14,7 @@ add_library(audio-ir-kernels OBJECT
        contrast_ir.cc
        dc_shift_ir.cc
        deemph_biquad_ir.cc
+        detect_pitch_frequency_ir.cc
        equalizer_biquad_ir.cc
        fade_ir.cc
        frequency_masking_ir.cc
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/detect_pitch_frequency_ir.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/detect_pitch_frequency_ir.cc
@ -0,0 +1,61 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "minddata/dataset/audio/ir/kernels/detect_pitch_frequency_ir.h"
+
+#include "minddata/dataset/audio/ir/validators.h"
+#include "minddata/dataset/audio/kernels/detect_pitch_frequency_op.h"
+
+namespace mindspore {
+namespace dataset {
+namespace audio {
+// DetectPitchFrequencyOperation
+// Records the arguments as given; checking them is left to ValidateParams().
+DetectPitchFrequencyOperation::DetectPitchFrequencyOperation(int32_t sample_rate, float frame_time, int32_t win_length,
+                                                             int32_t freq_low, int32_t freq_high)
+    : sample_rate_{sample_rate},
+      frame_time_{frame_time},
+      win_length_{win_length},
+      freq_low_{freq_low},
+      freq_high_{freq_high} {}
+
+// Checks the stored parameters one after another and returns the first failing Status, so the order of the
+// checks below decides which error the caller sees:
+//   - sample_rate must not be zero;
+//   - frame_time, win_length, freq_low and freq_high must be greater than zero.
+// NOTE(review): the relation between freq_low and freq_high is not checked here - confirm this is intended.
+Status DetectPitchFrequencyOperation::ValidateParams() {
+  RETURN_IF_NOT_OK(ValidateScalarNotZero("DetectPitchFrequency", "sample_rate", sample_rate_));
+  RETURN_IF_NOT_OK(ValidateFloatScalarPositive("DetectPitchFrequency", "frame_time", frame_time_));
+  RETURN_IF_NOT_OK(ValidateIntScalarPositive("DetectPitchFrequency", "win_length", win_length_));
+  RETURN_IF_NOT_OK(ValidateIntScalarPositive("DetectPitchFrequency", "freq_low", freq_low_));
+  RETURN_IF_NOT_OK(ValidateIntScalarPositive("DetectPitchFrequency", "freq_high", freq_high_));
+  return Status::OK();
+}
+
+// Creates the runtime kernel (DetectPitchFrequencyOp) configured with this operation's parameters.
+std::shared_ptr<TensorOp> DetectPitchFrequencyOperation::Build() {
+  return std::make_shared<DetectPitchFrequencyOp>(sample_rate_, frame_time_, win_length_, freq_low_, freq_high_);
+}
+
+// Serializes the parameters into a JSON object keyed by parameter name and writes it to *out_json.
+Status DetectPitchFrequencyOperation::to_json(nlohmann::json *out_json) {
+  nlohmann::json params = {{"sample_rate", sample_rate_},
+                           {"frame_time", frame_time_},
+                           {"win_length", win_length_},
+                           {"freq_low", freq_low_},
+                           {"freq_high", freq_high_}};
+  *out_json = params;
+  return Status::OK();
+}
+}  // namespace audio
+}  // namespace dataset
+}  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/detect_pitch_frequency_ir.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/ir/kernels/detect_pitch_frequency_ir.h
@ -0,0 +1,60 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_DETECT_PITCH_FREQUENCY_IR_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_DETECT_PITCH_FREQUENCY_IR_H_
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "include/api/status.h"
+#include "minddata/dataset/include/dataset/constants.h"
+#include "minddata/dataset/include/dataset/transforms.h"
+#include "minddata/dataset/kernels/ir/tensor_operation.h"
+
+namespace mindspore {
+namespace dataset {
+namespace audio {
+constexpr char kDetectPitchFrequencyOperation[] = "DetectPitchFrequency";
+
+/// \brief IR description of the DetectPitchFrequency transform; Build() turns it into the runtime kernel.
+class DetectPitchFrequencyOperation : public TensorOperation {
+ public:
+  /// \brief Constructor, stores the parameters without checking them.
+  /// \param[in] sample_rate Sampling rate of the waveform (Hz).
+  /// \param[in] frame_time Duration of a frame.
+  /// \param[in] win_length The window length for median smoothing (in number of frames).
+  /// \param[in] freq_low Lowest frequency that can be detected (Hz).
+  /// \param[in] freq_high Highest frequency that can be detected (Hz).
+  DetectPitchFrequencyOperation(int32_t sample_rate, float frame_time, int32_t win_length, int32_t freq_low,
+                                int32_t freq_high);
+
+  ~DetectPitchFrequencyOperation() = default;
+
+  /// \brief Name of this operation.
+  std::string Name() const override { return kDetectPitchFrequencyOperation; }
+
+  /// \brief Check the stored parameters.
+  Status ValidateParams() override;
+
+  /// \brief Create the kernel that performs the computation.
+  std::shared_ptr<TensorOp> Build() override;
+
+  /// \brief Write the parameters to a JSON object.
+  Status to_json(nlohmann::json *out_json) override;
+
+ private:
+  int32_t sample_rate_;
+  float frame_time_;
+  int32_t win_length_;
+  int32_t freq_low_;
+  int32_t freq_high_;
+};
+}  // namespace audio
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_DETECT_PITCH_FREQUENCY_IR_H_
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/CMakeLists.txt
@ -15,6 +15,7 @@ add_library(audio-kernels OBJECT
        contrast_op.cc
        dc_shift_op.cc
        deemph_biquad_op.cc
+        detect_pitch_frequency_op.cc
        equalizer_biquad_op.cc
        fade_op.cc
        frequency_masking_op.cc
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.cc
@ -650,5 +650,79 @@ Status Magphase(const TensorRow &input, TensorRow *output, float power) {

  return Status::OK();
 }
+
+// Applies a sliding median of win_length frames to every channel (row) of an integer <channel, num_of_frames>
+// tensor. Each channel is padded on the left with (win_length - 1) / 2 copies of its first value, so the result
+// has num_of_frames + (win_length - 1) / 2 - win_length + 1 entries per channel.
+Status MedianSmoothing(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length) {
+  CHECK_FAIL_RETURN_UNEXPECTED(win_length > 0, "MedianSmoothing: win_length must be greater than 0.");
+  const auto channel = input->shape()[0];
+  const auto num_of_frames = input->shape()[1];
+  // Centered windowed
+  const int32_t pad_length = (win_length - 1) / 2;
+  const auto padded_length = num_of_frames + pad_length;
+  const int32_t out_length = padded_length - win_length + 1;
+  // A window longer than the padded signal would yield a negative length and out-of-range reads below.
+  CHECK_FAIL_RETURN_UNEXPECTED(out_length >= 0,
+                               "MedianSmoothing: win_length is too large for the number of frames of the input.");
+  TensorShape out_shape({channel, out_length});
+
+  // Tensor -> vector
+  std::vector<int> signal;
+  signal.reserve(channel * num_of_frames);
+  for (auto itr = input->begin<int>(); itr != input->end<int>(); ++itr) {
+    signal.push_back(*itr);
+  }
+
+  // "replicate" padding on the left of every channel, followed by the channel's own values
+  std::vector<int> padded(channel * padded_length, 0);
+  for (dsize_t i = 0; i < channel; ++i) {
+    const auto src = signal.begin() + i * num_of_frames;
+    const auto dst = padded.begin() + i * padded_length;
+    if (pad_length > 0) {
+      std::fill_n(dst, pad_length, *src);
+    }
+    std::copy_n(src, num_of_frames, dst + pad_length);
+  }
+
+  // The median is the element that a full sort would place at position pad_length of the window;
+  // nth_element finds exactly that element without sorting the rest.
+  std::vector<int> out;
+  out.reserve(channel * out_length);
+  std::vector<int> window(win_length);
+  for (dsize_t i = 0; i < channel; ++i) {
+    const auto row = padded.begin() + i * padded_length;
+    for (int32_t j = 0; j < out_length; ++j) {
+      window.assign(row + j, row + j + win_length);
+      std::nth_element(window.begin(), window.begin() + pad_length, window.end());
+      out.push_back(window[pad_length]);
+    }
+  }
+  RETURN_IF_NOT_OK(Tensor::CreateFromVector(out, out_shape, output));
+  return Status::OK();
+}
+
+// Estimates the pitch frequency of a waveform of shape <..., time>:
+//   1. all leading dimensions are packed into a single channel axis,
+//   2. the NCCF is computed and the best lag is picked per frame,
+//   3. the lags are median-smoothed over win_length frames,
+//   4. each smoothed lag is converted to a frequency as sample_rate / lag.
+// The result is a float32 tensor whose shape equals the input shape with the last dimension replaced by the
+// number of smoothed frames.
+// NOTE(review): Reshape() below changes the shape of the caller's tensor and it is not restored afterwards.
+Status DetectPitchFrequency(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
+                            float frame_time, int32_t win_length, int32_t freq_low, int32_t freq_high) {
+  std::shared_ptr<Tensor> nccf;
+  std::shared_ptr<Tensor> indices;
+  std::shared_ptr<Tensor> smooth_indices;
+  // pack batch
+  TensorShape input_shape = input->shape();
+  // An empty time axis would turn the batch size computation below into a division by zero.
+  CHECK_FAIL_RETURN_UNEXPECTED(
+    input_shape.Size() > 0 && input_shape[-1] > 0,
+    "DetectPitchFrequency: input tensor is not in shape of <..., time>, or its time dimension is empty.");
+  TensorShape to_shape({input->Size() / input_shape[-1], input_shape[-1]});
+  RETURN_IF_NOT_OK(input->Reshape(to_shape));
+  if (input->type() == DataType(DataType::DE_FLOAT32)) {
+    RETURN_IF_NOT_OK(ComputeNccf<float>(input, &nccf, sample_rate, frame_time, freq_low));
+    RETURN_IF_NOT_OK(FindMaxPerFrame<float>(nccf, &indices, sample_rate, freq_high));
+  } else if (input->type() == DataType(DataType::DE_FLOAT64)) {
+    RETURN_IF_NOT_OK(ComputeNccf<double>(input, &nccf, sample_rate, frame_time, freq_low));
+    RETURN_IF_NOT_OK(FindMaxPerFrame<double>(nccf, &indices, sample_rate, freq_high));
+  } else {
+    // every other data type is treated as float16
+    RETURN_IF_NOT_OK(ComputeNccf<float16>(input, &nccf, sample_rate, frame_time, freq_low));
+    RETURN_IF_NOT_OK(FindMaxPerFrame<float16>(nccf, &indices, sample_rate, freq_high));
+  }
+  RETURN_IF_NOT_OK(MedianSmoothing(indices, &smooth_indices, win_length));
+
+  // Convert indices to frequency
+  constexpr double EPSILON = 1e-9;
+  TensorShape freq_shape = smooth_indices->shape();
+  std::vector<float> out;
+  out.reserve(smooth_indices->Size());
+  for (auto itr_fre = smooth_indices->begin<int>(); itr_fre != smooth_indices->end<int>(); ++itr_fre) {
+    // EPSILON keeps the division defined for a zero lag
+    out.push_back(static_cast<float>(sample_rate / (EPSILON + *itr_fre)));
+  }
+
+  // unpack batch
+  auto shape_vec = input_shape.AsVector();
+  shape_vec[shape_vec.size() - 1] = freq_shape[-1];
+  TensorShape out_shape(shape_vec);
+  RETURN_IF_NOT_OK(Tensor::CreateFromVector(out, out_shape, output));
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/audio_utils.h
@ -372,6 +372,191 @@ Status Vol(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output
 /// \param power: Power of the norm.
 Status Magphase(const TensorRow &input, TensorRow *output, float power);

+/// \brief Compute Normalized Cross-Correlation Function (NCCF).
+/// \details Every channel is zero-padded at its end and cut into num_of_frames frames of
+///     ceil(sample_rate * frame_time) samples. For each lag in [1, lags] and each frame, the frame is multiplied
+///     element-wise with the samples located `lag` positions later; the sum of products is divided by the sum of
+///     squares of both segments (the result is 0 when either sum of squares is 0).
+/// \param input: Tensor of shape <channel,waveform_length>.
+/// \param output: Tensor of shape <channel, num_of_frames, lags>, created with the data type of the input.
+/// \param sample_rate: The sample rate of the waveform (Hz).
+/// \param frame_time: Duration of a frame.
+/// \param freq_low: Lowest frequency that can be detected (Hz).
+/// \return Status code.
+/// \note NOTE(review): the arguments are not checked here; a zero freq_low or a non-positive
+///     sample_rate * frame_time is presumably rejected by the caller - confirm.
+template <typename T>
+Status ComputeNccf(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
+                   float frame_time, int32_t freq_low) {
+  auto channel = input->shape()[0];
+  auto waveform_length = input->shape()[1];
+  size_t idx = 0;
+  size_t channel_idx = 1;
+  // number of lags to evaluate, frame length in samples and number of frames per channel
+  int32_t lags = static_cast<int32_t>(ceil(static_cast<float>(sample_rate) / freq_low));
+  int32_t frame_size = static_cast<int32_t>(ceil(sample_rate * frame_time));
+  int32_t num_of_frames = static_cast<int32_t>(ceil(static_cast<float>(waveform_length) / frame_size));
+  // zeros to append so that each padded channel holds num_of_frames * frame_size + lags samples
+  int32_t p = lags + num_of_frames * frame_size - waveform_length;
+  TensorShape output_shape({channel, num_of_frames, lags});
+  DataType intput_type = input->type();
+  RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_shape, intput_type, output));
+  // `signal` receives every channel followed by p zeros (padding of the last dimension)
+  std::vector<T> signal;
+  // Tensor -> vector
+  for (auto itr = input->begin<T>(); itr != input->end<T>();) {
+    // copy the samples of the current channel
+    while (idx < waveform_length * channel_idx) {
+      signal.push_back(*itr);
+      ++itr;
+      ++idx;
+    }
+    // Each channel is processed with the sliding window
+    // waveform: [channel, time] --> waveform: [channel, time + p]
+    for (size_t i = 0; i < p; ++i) {
+      signal.push_back(static_cast<T>(0.0));
+    }
+    if (idx % waveform_length == 0) {
+      ++channel_idx;
+    }
+  }
+  // compute the NCC for every lag
+  for (dsize_t lag = 1; lag <= lags; ++lag) {
+    // NCC values of this lag, stored channel by channel and frame by frame
+    std::vector<T> out;
+    channel_idx = 1;
+    idx = 0;
+    size_t win_idx = 0;
+    // length of one padded channel inside `signal`
+    size_t waveform_length_p = waveform_length + p;
+    // Traversal signal
+    for (auto itr = signal.begin(); itr != signal.end();) {
+      // Each channel is processed with the sliding window
+      // s1 walks through the frame, s2 through the segment shifted by `lag`
+      size_t s1 = idx;
+      size_t s2 = idx + lag;
+      size_t frame_count = 0;
+      T s1_norm = static_cast<T>(0);
+      T s2_norm = static_cast<T>(0);
+      T ncc_umerator = static_cast<T>(0);
+      T ncc = static_cast<T>(0);
+      while (idx < waveform_length_p * channel_idx) {
+        // Sliding window
+        // all frames of this channel are done: just skip the remaining padded samples
+        if (frame_count == num_of_frames) {
+          ++itr;
+          ++idx;
+          continue;
+        }
+        // accumulate the sum of products and both sums of squares for the current frame
+        if (win_idx < frame_size) {
+          ncc_umerator += signal[s1] * signal[s2];
+          s1_norm += signal[s1] * signal[s1];
+          s2_norm += signal[s2] * signal[s2];
+          ++win_idx;
+          ++s1;
+          ++s2;
+        }
+        // frame complete: emit its NCC value and reset the accumulators
+        if (win_idx == frame_size) {
+          if (s1_norm != static_cast<T>(0.0) && s2_norm != static_cast<T>(0.0)) {
+            ncc = ncc_umerator / s1_norm / s2_norm;
+          } else {
+            ncc = static_cast<T>(0.0);
+          }
+          out.push_back(ncc);
+          ncc_umerator = static_cast<T>(0.0);
+          s1_norm = static_cast<T>(0.0);
+          s2_norm = static_cast<T>(0.0);
+          ++frame_count;
+          win_idx = 0;
+        }
+        ++itr;
+        ++idx;
+      }
+      if (idx % waveform_length_p == 0) {
+        ++channel_idx;
+      }
+    }  // compute one ncc
+    // write the values of this lag into the output at position lag - 1 of the last dimension
+    auto itr_out = out.begin();
+    for (dsize_t row_idx = 0; row_idx < channel; ++row_idx) {
+      for (dsize_t frame_idx = 0; frame_idx < num_of_frames; ++frame_idx) {
+        RETURN_IF_NOT_OK((*output)->SetItemAt({row_idx, frame_idx, lag - 1}, *itr_out));
+        ++itr_out;
+      }
+    }
+  }  // compute ncc
+  return Status::OK();
+}
+
+/// \brief For each frame, take the highest value of NCCF.
+/// \details Only positions at or after lag_min = ceil(sample_rate / freq_high) are searched. Besides the overall
+///     maximum, the maximum found in the first half of the lag axis is tracked; it is preferred whenever it
+///     exceeds 0.99 times the overall maximum. The stored value is the chosen position plus one.
+/// \param input: Tensor of shape <channel, num_of_frames, lags>.
+/// \param output: Integer tensor of shape <channel, num_of_frames>.
+/// \param sample_rate: The sample rate of the waveform (Hz).
+/// \param freq_high: Highest frequency that can be detected (Hz).
+/// \return Status code, an error if lag_min does not lie inside the lag axis.
+template <typename T>
+Status FindMaxPerFrame(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
+                       int32_t freq_high) {
+  const auto channel = input->shape()[0];
+  const auto num_of_frames = input->shape()[1];
+  const auto lags = input->shape()[2];
+  const int32_t lag_min = static_cast<int32_t>(ceil(static_cast<float>(sample_rate) / freq_high));
+  // The search starts at lag_min, so it must be a valid position of the lag axis; otherwise the reads below
+  // would leave the current frame (e.g. when freq_high is not greater than freq_low).
+  CHECK_FAIL_RETURN_UNEXPECTED(
+    lag_min >= 0 && lag_min < lags,
+    "FindMaxPerFrame: ceil(sample_rate / freq_high) must be non-negative and less than the number of lags.");
+  TensorShape out_shape({channel, num_of_frames});
+
+  // Tensor -> vector
+  std::vector<T> signal;
+  signal.reserve(channel * num_of_frames * lags);
+  for (auto itr = input->begin<T>(); itr != input->end<T>(); ++itr) {
+    signal.push_back(*itr);
+  }
+
+  // find the best nccf
+  std::vector<int> out;
+  out.reserve(channel * num_of_frames);
+  const auto thresh = static_cast<T>(0.99);
+  const auto lags_half = lags / 2;
+  for (dsize_t channel_idx = 0; channel_idx < channel; ++channel_idx) {
+    for (dsize_t frame_idx = 0; frame_idx < num_of_frames; ++frame_idx) {
+      // offset of this frame's first lag inside the flattened tensor
+      const auto frame_begin = channel_idx * num_of_frames * lags + frame_idx * lags;
+      T best_max_value = signal[frame_begin + lag_min];
+      T half_max_value = best_max_value;
+      int32_t best_max_indices = lag_min;
+      int32_t half_max_indices = lag_min;
+      for (dsize_t lag_idx = lag_min + 1; lag_idx < lags; ++lag_idx) {
+        const T value = signal[frame_begin + lag_idx];
+        if (value > best_max_value) {
+          best_max_value = value;
+          best_max_indices = static_cast<int32_t>(lag_idx);
+          // a new running maximum inside the first half is also the maximum of that half so far
+          if (lag_idx < lags_half) {
+            half_max_value = value;
+            half_max_indices = static_cast<int32_t>(lag_idx);
+          }
+        }
+      }
+      // The positions already include lag_min because the search runs on the full lag axis.
+      // Add 1 empirical calibration offset
+      if (half_max_value > best_max_value * thresh) {
+        out.push_back(half_max_indices + 1);
+      } else {
+        out.push_back(best_max_indices + 1);
+      }
+    }
+  }
+  RETURN_IF_NOT_OK(Tensor::CreateFromVector(out, out_shape, output));
+  return Status::OK();
+}
+
+/// \brief Apply median smoothing to every channel (row) of the tensor over the given window.
+/// \param input: Integer tensor of shape <channel, num_of_frames>.
+/// \param output: Tensor of shape <channel, num_of_window>, where
+///     num_of_window = num_of_frames + (win_length - 1) / 2 - win_length + 1.
+/// \param win_length: The window length for median smoothing (in number of frames).
+/// \return Status code.
+Status MedianSmoothing(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length);
+
+/// \brief Detect pitch frequency.
+/// \param input: Tensor of shape <..., time>; all leading dimensions are treated as channels.
+/// \param output: Float tensor of shape <..., num_of_window> holding one frequency per smoothed frame.
+/// \param sample_rate: The sample rate of the waveform (Hz).
+/// \param frame_time: Duration of a frame.
+/// \param win_length: The window length for median smoothing (in number of frames).
+/// \param freq_low: Lowest frequency that can be detected (Hz).
+/// \param freq_high: Highest frequency that can be detected (Hz).
+/// \return Status code.
+Status DetectPitchFrequency(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
+                            float frame_time, int32_t win_length, int32_t freq_low, int32_t freq_high);
+
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/detect_pitch_frequency_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/detect_pitch_frequency_op.cc
@ -0,0 +1,37 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "minddata/dataset/audio/kernels/detect_pitch_frequency_op.h"
+
+#include "minddata/dataset/audio/kernels/audio_utils.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+// Checks the input tensor (shape and data type) and delegates the computation to DetectPitchFrequency().
+Status DetectPitchFrequencyOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
+  IO_CHECK(input, output);
+  TensorShape input_shape = input->shape();
+  // check input tensor dimension, it should be greater than 0.
+  CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0,
+                               "DetectPitchFrequency: input tensor is not in shape of <..., time>.");
+  // check input type, it should be DE_FLOAT16, DE_FLOAT32 or DE_FLOAT64
+  // (the message names all three accepted types, float16 included)
+  CHECK_FAIL_RETURN_UNEXPECTED(
+    input->type() == DataType(DataType::DE_FLOAT32) || input->type() == DataType(DataType::DE_FLOAT64) ||
+      input->type() == DataType(DataType::DE_FLOAT16),
+    "DetectPitchFrequency: input tensor type should be float16, float32 or float64, but got: " +
+      input->type().ToString());
+  return DetectPitchFrequency(input, output, sample_rate_, frame_time_, win_length_, freq_low_, freq_high_);
+}
+}  // namespace dataset
+}  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/audio/kernels/detect_pitch_frequency_op.h
+++ b/mindspore/ccsrc/minddata/dataset/audio/kernels/detect_pitch_frequency_op.h
@ -0,0 +1,58 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_DETECT_PITCH_FREQUENCY_OP_H_
+#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_DETECT_PITCH_FREQUENCY_OP_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+/// \brief Kernel of the DetectPitchFrequency transform; Compute() produces the detected frequencies.
+class DetectPitchFrequencyOp : public TensorOp {
+ public:
+  /// \brief Constructor, stores the parameters as given.
+  DetectPitchFrequencyOp(int32_t sample_rate, float frame_time, int32_t win_length, int32_t freq_low, int32_t freq_high)
+      : sample_rate_{sample_rate},
+        frame_time_{frame_time},
+        win_length_{win_length},
+        freq_low_{freq_low},
+        freq_high_{freq_high} {}
+
+  ~DetectPitchFrequencyOp() override = default;
+
+  /// \brief Name of this kernel.
+  std::string Name() const override { return kDetectPitchFrequencyOp; }
+
+  /// \brief Run the pitch detection on one input tensor.
+  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+
+  /// \brief Write the kernel name and all parameters on one line.
+  void Print(std::ostream &out) const override {
+    out << Name() << ": sample_rate: " << sample_rate_;
+    out << ", frame_time: " << frame_time_;
+    out << ", win_length: " << win_length_;
+    out << ", freq_low: " << freq_low_;
+    out << ", freq_high: " << freq_high_ << std::endl;
+  }
+
+ private:
+  int32_t sample_rate_;
+  float frame_time_;
+  int32_t win_length_;
+  int32_t freq_low_;
+  int32_t freq_high_;
+};
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_DETECT_PITCH_FREQUENCY_OP_H_
--- a/mindspore/ccsrc/minddata/dataset/include/dataset/audio.h
+++ b/mindspore/ccsrc/minddata/dataset/include/dataset/audio.h
@ -300,6 +300,33 @@ class DeemphBiquad final : public TensorTransform {
  std::shared_ptr<Data> data_;
 };

+/// \brief Detect pitch frequency.
+class DetectPitchFrequency final : public TensorTransform {
+ public:
+  /// \brief Constructor.
+  /// \param[in] sample_rate Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
+  /// \param[in] frame_time Duration of a frame, the value must be greater than zero (default=0.01).
+  /// \param[in] win_length The window length for median smoothing (in number of frames), the value must
+  ///     be greater than zero (default=30).
+  /// \param[in] freq_low Lowest frequency that can be detected (Hz), the value must be greater than zero (default=85).
+  /// \param[in] freq_high Highest frequency that can be detected (Hz), the value must be greater than
+  ///     zero (default=3400).
+  explicit DetectPitchFrequency(int32_t sample_rate, float frame_time = 0.01, int32_t win_length = 30,
+                                int32_t freq_low = 85, int32_t freq_high = 3400);
+
+  /// \brief Destructor.
+  ~DetectPitchFrequency() = default;
+
+ protected:
+  /// \brief Function to convert TensorTransform object into a TensorOperation object.
+  /// \return Shared pointer to TensorOperation object.
+  std::shared_ptr<TensorOperation> Parse() override;
+
+ private:
+  // Parameters of the transform; the struct is defined in the implementation file.
+  struct Data;
+  std::shared_ptr<Data> data_;
+};
+
 /// \brief EqualizerBiquad TensorTransform. Apply highpass biquad filter on audio.
 class EqualizerBiquad final : public TensorTransform {
 public:
--- a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
@ -155,6 +155,7 @@ constexpr char kComplexNormOp[] = "ComplexNormOp";
 constexpr char kContrastOp[] = "ContrastOp";
 constexpr char kDCShiftOp[] = "DCShiftOp";
 constexpr char kDeemphBiquadOp[] = "DeemphBiquadOp";
+constexpr char kDetectPitchFrequencyOp[] = "DetectPitchFrequencyOp";
 constexpr char kEqualizerBiquadOp[] = "EqualizerBiquadOp";
 constexpr char kFadeOp[] = "FadeOp";
 constexpr char kFrequencyMaskingOp[] = "FrequencyMaskingOp";
--- a/mindspore/dataset/audio/transforms.py
+++ b/mindspore/dataset/audio/transforms.py
@ -26,8 +26,8 @@ from ..transforms.c_transforms import TensorOperation
 from .utils import FadeShape, GainType, ScaleType
 from .validators import check_allpass_biquad, check_amplitude_to_db, check_band_biquad, check_bandpass_biquad, \
    check_bandreject_biquad, check_bass_biquad, check_biquad, check_complex_norm, check_contrast, check_dc_shift, \
-    check_deemph_biquad, check_equalizer_biquad, check_fade, check_highpass_biquad, check_lfilter, \
-    check_lowpass_biquad, check_magphase, check_masking, check_mu_law_decoding, check_riaa_biquad, \
+    check_deemph_biquad, check_detect_pitch_frequency, check_equalizer_biquad, check_fade, check_highpass_biquad, \
+    check_lfilter, check_lowpass_biquad, check_magphase, check_masking, check_mu_law_decoding, check_riaa_biquad, \
    check_time_stretch, check_treble_biquad, check_vol


@ -379,6 +379,45 @@ class DeemphBiquad(AudioTensorOperation):
        return cde.DeemphBiquadOperation(self.sample_rate)


+class DetectPitchFrequency(AudioTensorOperation):
+    """
+    Detect pitch frequency.
+
+    It is implemented using normalized cross-correlation function and median smoothing.
+
+    Args:
+        sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
+        frame_time (float, optional): Duration of a frame, the value must be greater than zero (default=0.01).
+        win_length (int, optional): The window length for median smoothing (in number of frames), the value must be
+            greater than zero (default=30).
+        freq_low (int, optional): Lowest frequency that can be detected (Hz), the value must be greater than zero
+            (default=85).
+        freq_high (int, optional): Highest frequency that can be detected (Hz), the value must be greater than zero
+            (default=3400).
+
+    Examples:
+        >>> import numpy as np
+        >>>
+        >>> waveform = np.array([[0.716064e-03, 5.347656e-03, 6.246826e-03, 2.089477e-02, 7.138305e-02],
+        ...                      [4.156616e-02, 1.394653e-02, 3.550292e-02, 0.614379e-02, 3.840209e-02]])
+        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
+        >>> transforms = [audio.DetectPitchFrequency(30, 0.1, 3, 5, 25)]
+        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
+    """
+
+    @check_detect_pitch_frequency
+    def __init__(self, sample_rate, frame_time=0.01, win_length=30, freq_low=85, freq_high=3400):
+        # The decorator has already checked the arguments; they are only stored here.
+        self.sample_rate, self.frame_time = sample_rate, frame_time
+        self.win_length = win_length
+        self.freq_low, self.freq_high = freq_low, freq_high
+
+    def parse(self):
+        # The C++ operation takes its arguments positionally, in this order.
+        op_args = (self.sample_rate, self.frame_time, self.win_length, self.freq_low, self.freq_high)
+        return cde.DetectPitchFrequencyOperation(*op_args)
+
+
 class EqualizerBiquad(AudioTensorOperation):
    """
    Design biquad equalizer filter and perform filtering. Similar to SoX implementation.
--- a/mindspore/dataset/audio/validators.py
+++ b/mindspore/dataset/audio/validators.py
@ -453,3 +453,25 @@ def check_vol(method):
        return method(self, *args, **kwargs)

    return new_method
+
+
+def check_detect_pitch_frequency(method):
+    """Wrapper method to check the parameters of DetectPitchFrequency."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [sample_rate, frame_time, win_length, freq_low, freq_high], _ = parse_user_args(
+            method, *args, **kwargs)
+        type_check(sample_rate, (int,), "sample_rate")
+        check_int32_not_zero(sample_rate, "sample_rate")
+        type_check(frame_time, (float, int), "frame_time")
+        check_pos_float32(frame_time, "frame_time")
+        type_check(win_length, (int,), "win_length")
+        check_pos_int32(win_length, "win_length")
+        # freq_low and freq_high are int32 parameters of the C++ operation, so a float accepted here would
+        # only fail later, when the operation object is created in parse().
+        type_check(freq_low, (int,), "freq_low")
+        check_pos_int32(freq_low, "freq_low")
+        type_check(freq_high, (int,), "freq_high")
+        check_pos_int32(freq_high, "freq_high")
+        return method(self, *args, **kwargs)
+
+    return new_method
--- a/tests/ut/cpp/dataset/c_api_audio_a_to_q_test.cc
+++ b/tests/ut/cpp/dataset/c_api_audio_a_to_q_test.cc
@ -1375,7 +1375,7 @@ TEST_F(MindDataTestPipeline, TestMagphaseWrongArgs) {
  std::shared_ptr<TensorTransform> magphase(new audio::Magphase(power_wrong));
  std::unordered_map<std::string, mindspore::MSTensor> row;

-  //Magphase: power must be greater than or equal to 0.
+  // Magphase: power must be greater than or equal to 0.
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeFloat32, {2, 2}));
  std::shared_ptr<Dataset> ds = RandomData(8, schema);
@ -1385,3 +1385,100 @@ TEST_F(MindDataTestPipeline, TestMagphaseWrongArgs) {
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_EQ(iter, nullptr);
 }
+
+// Pipeline test: applies DetectPitchFrequency (sample_rate 44100, remaining arguments defaulted)
+// to 50 random float32 rows of shape {2, 10000} and checks that every output row is float32
+// with shape {2, 8}.
+TEST_F(MindDataTestPipeline, TestDetectPitchFrequencyBasic) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestDetectPitchFrequencyBasic.";
+  // Original waveform
+  std::shared_ptr<SchemaObj> schema = Schema();
+  ASSERT_OK(schema->add_column("waveform", mindspore::DataType::kNumberTypeFloat32, {2, 10000}));
+  std::shared_ptr<Dataset> ds = RandomData(50, schema);
+  EXPECT_NE(ds, nullptr);
+
+  ds = ds->SetNumWorkers(4);
+  EXPECT_NE(ds, nullptr);
+
+  auto detect_pitch_frequency_op = audio::DetectPitchFrequency(44100);
+
+  ds = ds->Map({detect_pitch_frequency_op});
+  EXPECT_NE(ds, nullptr);
+
+  // Detect pitch frequency
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  // Check the iterator itself (not the dataset again); it is dereferenced right below,
+  // so a null iterator must abort the test instead of crashing it.
+  ASSERT_NE(iter, nullptr);
+
+  std::unordered_map<std::string, mindspore::MSTensor> row;
+  ASSERT_OK(iter->GetNextRow(&row));
+
+  std::vector<int64_t> expected = {2, 8};
+
+  // An empty row marks the end of the dataset.
+  int i = 0;
+  while (row.size() != 0) {
+    auto col = row["waveform"];
+    ASSERT_EQ(col.Shape(), expected);
+    ASSERT_EQ(col.Shape().size(), 2);
+    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
+    ASSERT_OK(iter->GetNextRow(&row));
+    i++;
+  }
+  EXPECT_EQ(i, 50);
+  iter->Stop();
+}
+
+// Pipeline test for argument validation: each case passes zero for exactly one argument of
+// DetectPitchFrequency. Map() is expected to still return a dataset, while CreateIterator()
+// is expected to return nullptr.
+TEST_F(MindDataTestPipeline, TestDetectPitchFrequencyParamCheck) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestDetectPitchFrequencyParamCheck.";
+  std::shared_ptr<SchemaObj> schema = Schema();
+  // Original waveform
+  ASSERT_OK(schema->add_column("waveform", mindspore::DataType::kNumberTypeFloat32, {2, 2}));
+  std::shared_ptr<Dataset> ds = RandomData(50, schema);
+  // One mapped dataset per invalid-argument case, all derived from the same source dataset.
+  std::shared_ptr<Dataset> ds01;
+  std::shared_ptr<Dataset> ds02;
+  std::shared_ptr<Dataset> ds03;
+  std::shared_ptr<Dataset> ds04;
+  std::shared_ptr<Dataset> ds05;
+  EXPECT_NE(ds, nullptr);
+
+  // Check sample_rate
+  MS_LOG(INFO) << "sample_rate is zero.";
+  auto detect_pitch_frequency_op_01 = audio::DetectPitchFrequency(0);
+  ds01 = ds->Map({detect_pitch_frequency_op_01});
+  EXPECT_NE(ds01, nullptr);
+
+  std::shared_ptr<Iterator> iter01 = ds01->CreateIterator();
+  EXPECT_EQ(iter01, nullptr);
+
+  // Check frame_time
+  MS_LOG(INFO) << "frame_time is zero.";
+  auto detect_pitch_frequency_op_02 = audio::DetectPitchFrequency(30, 0);
+  ds02 = ds->Map({detect_pitch_frequency_op_02});
+  EXPECT_NE(ds02, nullptr);
+
+  std::shared_ptr<Iterator> iter02 = ds02->CreateIterator();
+  EXPECT_EQ(iter02, nullptr);
+
+  // Check win_length
+  MS_LOG(INFO) << "win_length is zero.";
+  auto detect_pitch_frequency_op_03 = audio::DetectPitchFrequency(30, 0.1, 0);
+  ds03 = ds->Map({detect_pitch_frequency_op_03});
+  EXPECT_NE(ds03, nullptr);
+
+  std::shared_ptr<Iterator> iter03 = ds03->CreateIterator();
+  EXPECT_EQ(iter03, nullptr);
+
+  // Check freq_low
+  MS_LOG(INFO) << "freq_low is zero.";
+  auto detect_pitch_frequency_op_04 = audio::DetectPitchFrequency(30, 0.1, 3, 0);
+  ds04 = ds->Map({detect_pitch_frequency_op_04});
+  EXPECT_NE(ds04, nullptr);
+
+  std::shared_ptr<Iterator> iter04 = ds04->CreateIterator();
+  EXPECT_EQ(iter04, nullptr);
+
+  // Check freq_high
+  MS_LOG(INFO) << "freq_high is zero.";
+  auto detect_pitch_frequency_op_05 = audio::DetectPitchFrequency(30, 0.1, 3, 5, 0);
+  ds05 = ds->Map({detect_pitch_frequency_op_05});
+  EXPECT_NE(ds05, nullptr);
+
+  std::shared_ptr<Iterator> iter05 = ds05->CreateIterator();
+  EXPECT_EQ(iter05, nullptr);
+}
--- a/tests/ut/cpp/dataset/execute_test.cc
+++ b/tests/ut/cpp/dataset/execute_test.cc
@ -897,12 +897,8 @@ TEST_F(MindDataTestExecute, TestRiaaBiquadWithEager) {

 TEST_F(MindDataTestExecute, TestRiaaBiquadWithWrongArg) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestRiaaBiquadWithWrongArg.";
-  std::vector<float> labels = {
-    3.156, 5.690, 1.362, 1.093,
-    5.782, 6.381, 5.982, 3.098,
-    1.222, 6.027, 3.909, 7.993,
-    4.324, 1.092, 5.093, 0.991,
-    1.099, 4.092, 8.111, 6.666};
+  std::vector<float> labels = {3.156, 5.690, 1.362, 1.093, 5.782, 6.381, 5.982, 3.098, 1.222, 6.027,
+                               3.909, 7.993, 4.324, 1.092, 5.093, 0.991, 1.099, 4.092, 8.111, 6.666};
  std::shared_ptr<Tensor> input;
  ASSERT_OK(Tensor::CreateFromVector(labels, TensorShape({4, 5}), &input));
  auto input01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input));
@ -917,12 +913,8 @@ TEST_F(MindDataTestExecute, TestRiaaBiquadWithWrongArg) {
 TEST_F(MindDataTestExecute, TestTrebleBiquadWithEager) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestTrebleBiquadWithEager.";
  // Original waveform
-  std::vector<float> labels = {
-    3.156, 5.690, 1.362, 1.093,
-    5.782, 6.381, 5.982, 3.098,
-    1.222, 6.027, 3.909, 7.993,
-    4.324, 1.092, 5.093, 0.991,
-    1.099, 4.092, 8.111, 6.666};
+  std::vector<float> labels = {3.156, 5.690, 1.362, 1.093, 5.782, 6.381, 5.982, 3.098, 1.222, 6.027,
+                               3.909, 7.993, 4.324, 1.092, 5.093, 0.991, 1.099, 4.092, 8.111, 6.666};
  std::shared_ptr<Tensor> input;
  ASSERT_OK(Tensor::CreateFromVector(labels, TensorShape({2, 10}), &input));
  auto input_01 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input));
@ -949,9 +941,10 @@ TEST_F(MindDataTestExecute, TestTrebleBiquadWithWrongArg) {
  std::shared_ptr<TensorTransform> treble_biquad_op01 = std::make_shared<audio::TrebleBiquad>(0.0, 200.0);
  mindspore::dataset::Execute Transform01({treble_biquad_op01});
  EXPECT_ERROR(Transform01(input01, &input01));
-  //Check Q
+  // Check Q
  MS_LOG(INFO) << "Q is zero.";
-  std::shared_ptr<TensorTransform> treble_biquad_op02 = std::make_shared<audio::TrebleBiquad>(44100, 200.0, 3000.0, 0.0);
+  std::shared_ptr<TensorTransform> treble_biquad_op02 =
+    std::make_shared<audio::TrebleBiquad>(44100, 200.0, 3000.0, 0.0);
  mindspore::dataset::Execute Transform02({treble_biquad_op02});
  EXPECT_ERROR(Transform02(input02, &input02));
 }
@ -1169,8 +1162,7 @@ TEST_F(MindDataTestExecute, TestMagphaseEager) {
  float power = 1.0;
  std::vector<mindspore::MSTensor> output_tensor;
  std::shared_ptr<Tensor> test;
-  std::vector<float> test_vector = {3, 4, -3, 4, 3, -4, -3, -4,
-                                    5, 12, -5, 12, 5, -12, -5, -12};
+  std::vector<float> test_vector = {3, 4, -3, 4, 3, -4, -3, -4, 5, 12, -5, 12, 5, -12, -5, -12};
  Tensor::CreateFromVector(test_vector, TensorShape({2, 4, 2}), &test);
  auto input_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(test));
  std::shared_ptr<TensorTransform> magphase(new audio::Magphase({power}));
@ -1234,3 +1226,68 @@ TEST_F(MindDataTestExecute, TestRandomAdjustSharpnessEager) {
  Status rc = transform(image, &image);
  EXPECT_EQ(rc, Status::OK());
 }
+
+// Eager test: applies DetectPitchFrequency(30, 0.1, 3, 5, 25) to a {2, 10} tensor built from
+// double values and expects an OK status. Only the status is checked, not the output values.
+TEST_F(MindDataTestExecute, TestDetectPitchFrequencyWithEager) {
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestDetectPitchFrequencyWithEager.";
+  // Original waveform
+  std::vector<double> labels = {
+    3.716064453125000000e-03, 2.347656250000000000e-03, 9.246826171875000000e-03, 4.089477539062500000e-02,
+    3.138305664062500000e-02, 1.156616210937500000e-02, 0.394653320312500000e-02, 1.550292968750000000e-02,
+    1.614379882812500000e-02, 0.840209960937500000e-02, 1.718139648437500000e-02, 2.599121093750000000e-02,
+    5.647949218750000000e-02, 1.510620117187500000e-02, 2.385498046875000000e-02, 1.345825195312500000e-02,
+    1.419067382812500000e-02, 3.284790039062500000e-02, 9.052856445312500000e-02, 2.368896484375000000e-03};
+  std::shared_ptr<Tensor> input;
+  ASSERT_OK(Tensor::CreateFromVector(labels, TensorShape({2, 10}), &input));
+  auto input_02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input));
+  std::shared_ptr<TensorTransform> detect_pitch_frequency_01 =
+    std::make_shared<audio::DetectPitchFrequency>(30, 0.1, 3, 5, 25);
+  mindspore::dataset::Execute Transform01({detect_pitch_frequency_01});
+  // Detect pitch frequency
+  Status s01 = Transform01(input_02, &input_02);
+  EXPECT_TRUE(s01.IsOk());
+}
+
+// Eager test for argument validation: each case passes zero for exactly one argument of
+// DetectPitchFrequency and expects the eager call to return a non-OK status. The same
+// {2, 5} float tensor is passed as both input and output of every call.
+TEST_F(MindDataTestExecute, TestDetectPitchFrequencyWithWrongArg) {
+  MS_LOG(INFO) << "Doing MindDataTestExecute-TestDetectPitchFrequencyWithWrongArg.";
+  std::vector<float> labels = {
+    0.716064e-03, 5.347656e-03, 6.246826e-03, 2.089477e-02, 7.138305e-02,
+    4.156616e-02, 1.394653e-02, 3.550292e-02, 0.614379e-02, 3.840209e-02,
+  };
+  std::shared_ptr<Tensor> input;
+  ASSERT_OK(Tensor::CreateFromVector(labels, TensorShape({2, 5}), &input));
+  auto input_02 = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input));
+  // Check frame_time
+  MS_LOG(INFO) << "frame_time is zero.";
+  std::shared_ptr<TensorTransform> detect_pitch_frequency_01 =
+    std::make_shared<audio::DetectPitchFrequency>(40, 0, 3, 3, 20);
+  mindspore::dataset::Execute Transform01({detect_pitch_frequency_01});
+  Status s01 = Transform01(input_02, &input_02);
+  EXPECT_FALSE(s01.IsOk());
+  // Check win_length
+  MS_LOG(INFO) << "win_length is zero.";
+  std::shared_ptr<TensorTransform> detect_pitch_frequency_02 =
+    std::make_shared<audio::DetectPitchFrequency>(40, 0.1, 0, 3, 20);
+  mindspore::dataset::Execute Transform02({detect_pitch_frequency_02});
+  Status s02 = Transform02(input_02, &input_02);
+  EXPECT_FALSE(s02.IsOk());
+  // Check freq_low
+  MS_LOG(INFO) << "freq_low is zero.";
+  std::shared_ptr<TensorTransform> detect_pitch_frequency_03 =
+    std::make_shared<audio::DetectPitchFrequency>(40, 0.1, 3, 0, 20);
+  mindspore::dataset::Execute Transform03({detect_pitch_frequency_03});
+  Status s03 = Transform03(input_02, &input_02);
+  EXPECT_FALSE(s03.IsOk());
+  // Check freq_high
+  MS_LOG(INFO) << "freq_high is zero.";
+  std::shared_ptr<TensorTransform> detect_pitch_frequency_04 =
+    std::make_shared<audio::DetectPitchFrequency>(40, 0.1, 3, 3, 0);
+  mindspore::dataset::Execute Transform04({detect_pitch_frequency_04});
+  Status s04 = Transform04(input_02, &input_02);
+  EXPECT_FALSE(s04.IsOk());
+  // Check sample_rate
+  MS_LOG(INFO) << "sample_rate is zero.";
+  std::shared_ptr<TensorTransform> detect_pitch_frequency_05 = std::make_shared<audio::DetectPitchFrequency>(0);
+  mindspore::dataset::Execute Transform05({detect_pitch_frequency_05});
+  Status s05 = Transform05(input_02, &input_02);
+  EXPECT_FALSE(s05.IsOk());
+}
--- a/tests/ut/python/dataset/test_detect_pitch_frequency.py
+++ b/tests/ut/python/dataset/test_detect_pitch_frequency.py
@ -0,0 +1,114 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import numpy as np
+import pytest
+
+import mindspore.dataset as ds
+import mindspore.dataset.audio.transforms as audio
+from mindspore import log as logger
+
+
+def count_unequal_element(data_expected, data_me, rtol, atol):
+    """
+    Assert that two arrays have the same shape and agree within tolerance.
+
+    An element counts as mismatched when |expected - actual| > atol + |expected| * rtol.
+    The check passes only if the fraction of mismatched elements is strictly below rtol;
+    otherwise the assertion message lists the mismatched expected values, actual values
+    and absolute errors.
+    """
+    assert data_expected.shape == data_me.shape
+    total_count = len(data_expected.flatten())
+    error = np.abs(data_expected - data_me)
+    # Boolean mask of the elements whose error exceeds the combined tolerance.
+    greater = np.greater(error, atol + np.abs(data_expected) * rtol)
+    loss_count = np.count_nonzero(greater)
+    assert (loss_count / total_count) < rtol, \
+        "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
+        format(data_expected[greater], data_me[greater], error[greater])
+
+
+def test_detect_pitch_frequency_eager():
+    """
+    Eager mode, normal case: call DetectPitchFrequency(30, 0.1, 3, 5, 25) directly on a
+    (2, 10) float64 waveform and compare the result with the expected (2, 3) output.
+    """
+    # Original waveform
+    waveform = np.array([[2.716064453125e-03, 6.34765625e-03, 9.246826171875e-03, 1.0894775390625e-02,
+                          1.1383056640625e-02, 1.1566162109375e-02, 1.3946533203125e-02, 1.55029296875e-02,
+                          1.6143798828125e-02, 1.8402099609375e-02],
+                         [1.7181396484375e-02, 1.59912109375e-02, 1.64794921875e-02, 1.5106201171875e-02,
+                          1.385498046875e-02, 1.3458251953125e-02, 1.4190673828125e-02, 1.2847900390625e-02,
+                          1.0528564453125e-02, 9.368896484375e-03]], dtype=np.float64)
+    # Expected output
+    expect_waveform = np.array(
+        [[10., 10., 10.], [5., 5., 10.]], dtype=np.float64)
+    detect_pitch_frequency_op = audio.DetectPitchFrequency(30, 0.1, 3, 5, 25)
+    # Detect pitch frequency
+    output = detect_pitch_frequency_op(waveform)
+    count_unequal_element(expect_waveform, output, 0.0001, 0.0001)
+
+
+def test_detect_pitch_frequency_pipeline():
+    """
+    Pipeline mode, normal case: map DetectPitchFrequency(30, 0.1, 3, 5, 25) over a
+    NumpySlicesDataset built from a (2, 5) float64 waveform and compare every output row
+    with the matching row of the expected output.
+    """
+    # Original waveform
+    waveform = np.array([[0.716064453125e-03, 5.34765625e-03, 6.246826171875e-03, 2.0894775390625e-02,
+                          7.1383056640625e-02], [4.1566162109375e-02, 1.3946533203125e-02, 3.55029296875e-02,
+                                                 0.6143798828125e-02, 3.8402099609375e-02]], dtype=np.float64)
+    # Expected output, one row per dataset item
+    expect_waveform = np.array([[10.0000], [7.5000]], dtype=np.float64)
+    dataset = ds.NumpySlicesDataset(waveform, ["audio"], shuffle=False)
+    detect_pitch_frequency_op = audio.DetectPitchFrequency(30, 0.1, 3, 5, 25)
+    # Detect pitch frequency
+    dataset = dataset.map(input_columns=["audio"],
+                          operations=detect_pitch_frequency_op, num_parallel_workers=8)
+    i = 0
+    for item in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
+        count_unequal_element(expect_waveform[i, :],
+                              item['audio'], 0.0001, 0.0001)
+        i += 1
+    # Without this check the test would pass vacuously if the pipeline produced fewer rows
+    # than expected (for example none at all).
+    assert i == expect_waveform.shape[0]
+
+
+def test_detect_pitch_frequency_invalid_input():
+    """
+    Invalid-argument cases: constructing DetectPitchFrequency with a wrongly typed or
+    zero-valued argument must raise the expected exception type with the expected message.
+    """
+    def test_invalid_input(test_name, sample_rate, frame_time, win_length, freq_low, freq_high, error, error_msg):
+        # Construct the op with the given arguments and require that `error` is raised
+        # with `error_msg` contained in its text.
+        logger.info(
+            "Test DetectPitchFrequency with bad input: {0}".format(test_name))
+        with pytest.raises(error) as error_info:
+            audio.DetectPitchFrequency(
+                sample_rate, frame_time, win_length, freq_low, freq_high)
+        assert error_msg in str(error_info.value)
+
+    # Wrong argument types -> TypeError
+    test_invalid_input("invalid sample_rate parameter type as a float", 44100.5, 0.01, 30, 85, 3400, TypeError,
+                       "Argument sample_rate with value 44100.5 is not of type [<class 'int'>],"
+                       " but got <class 'float'>.")
+    test_invalid_input("invalid sample_rate parameter type as a String", "44100", 0.01, 30, 85, 3400, TypeError,
+                       "Argument sample_rate with value 44100 is not of type [<class 'int'>], but got <class 'str'>.")
+    test_invalid_input("invalid frame_time parameter type as a String", 44100, "0.01", 30, 85, 3400, TypeError,
+                       "Argument frame_time with value 0.01 is not of type [<class 'float'>, <class 'int'>],"
+                       " but got <class 'str'>.")
+    test_invalid_input("invalid win_length parameter type as a float", 44100, 0.01, 30.1, 85, 3400, TypeError,
+                       "Argument win_length with value 30.1 is not of type [<class 'int'>], but got <class 'float'>.")
+    test_invalid_input("invalid win_length parameter type as a String", 44100, 0.01, "30", 85, 3400, TypeError,
+                       "Argument win_length with value 30 is not of type [<class 'int'>], but got <class 'str'>.")
+    test_invalid_input("invalid freq_low parameter type as a String", 44100, 0.01, 30, "85", 3400, TypeError,
+                       "Argument freq_low with value 85 is not of type [<class 'int'>, <class 'float'>],"
+                       " but got <class 'str'>.")
+    test_invalid_input("invalid freq_high parameter type as a String", 44100, 0.01, 30, 85, "3400", TypeError,
+                       "Argument freq_high with value 3400 is not of type [<class 'int'>, <class 'float'>],"
+                       " but got <class 'str'>.")
+    # Zero-valued arguments -> ValueError
+    test_invalid_input("invalid sample_rate parameter value", 0, 0.01, 30, 85, 3400, ValueError,
+                       "Input sample_rate is not within the required interval of [-2147483648, 0) and (0, 2147483647].")
+    test_invalid_input("invalid frame_time parameter value", 44100, 0, 30, 85, 3400, ValueError,
+                       "Input frame_time is not within the required interval of (0, 16777216].")
+    test_invalid_input("invalid win_length parameter value", 44100, 0.01, 0, 85, 3400, ValueError,
+                       "Input win_length is not within the required interval of [1, 2147483647].")
+    test_invalid_input("invalid freq_low parameter value", 44100, 0.01, 30, 0, 3400, ValueError,
+                       "Input freq_low is not within the required interval of (0, 16777216].")
+    test_invalid_input("invalid freq_high parameter value", 44100, 0.01, 30, 85, 0, ValueError,
+                       "Input freq_high is not within the required interval of (0, 16777216].")
+
+
+# Run all test cases in order when this file is executed directly as a script.
+if __name__ == "__main__":
+    test_detect_pitch_frequency_eager()
+    test_detect_pitch_frequency_pipeline()
+    test_detect_pitch_frequency_invalid_input()