[assistant][ops][I5EWI6] Add new data operator MFCC.

This commit is contained in:
panshinanyi 2022-12-08 16:59:22 +08:00
parent d870c9090c
commit ef952d9977
21 changed files with 1183 additions and 4 deletions

View File

@ -0,0 +1,27 @@
mindspore.dataset.audio.MFCC
============================
.. py:class:: mindspore.dataset.audio.MFCC(sample_rate=16000, n_mfcc=40, dct_type=2, norm=NormMode.ORTHO, log_mels=False, melkwargs=None)
计算音频信号的梅尔频率倒谱系数。
参数:
- **sample_rate** (int, 可选) - 采样频率（单位：Hz），不能小于零。默认值：16000。
- **n_mfcc** (int, 可选) - 要保留的梅尔频率倒谱系数的数量，不能小于零。默认值：40。
- **dct_type** (int, 可选) - 要使用的离散余弦变换类型，只能为2。默认值：2。
- **norm** (NormMode, 可选) - 要使用的归一化模式。默认值：NormMode.ORTHO。
- **log_mels** (bool, 可选) - 是否使用梅尔对数谱图而不是分贝刻度。默认值：False。
- **melkwargs** (dict, 可选) - 梅尔频谱的参数，如果为None则使用默认参数。默认值：None，会被设置为
`{'n_fft': 400, 'win_length': n_fft, 'hop_length': win_length // 2, 'f_min' : 0.0, 'f_max' : sample_rate // 2,
'pad': 0, 'n_mels': 128, 'window': WindowType.HANN, 'power': 2.0, 'normalized': False, 'center': True,
'pad_mode': BorderType.REFLECT, 'onesided': True, 'norm' : NormType.NONE, 'mel_scale' : MelType.HTK}` 。
异常:
- **TypeError** - 如果 `sample_rate` 的类型不为int。
- **TypeError** - 如果 `log_mels` 的类型不为bool。
- **TypeError** - 如果 `norm` 的类型不为 :class:`mindspore.dataset.audio.utils.NormMode` 。
- **TypeError** - 如果 `n_mfcc` 的类型不为int。
- **TypeError** - 如果 `melkwargs` 的类型不为dict。
- **ValueError** - 如果 `sample_rate` 为负数。
- **ValueError** - 如果 `n_mfcc` 为负数。
- **ValueError** - 如果 `dct_type` 不为2。

View File

@ -388,6 +388,7 @@ API样例中常用的导入模块如下
mindspore.dataset.audio.MaskAlongAxis
mindspore.dataset.audio.MaskAlongAxisIID
mindspore.dataset.audio.MelScale
mindspore.dataset.audio.MFCC
mindspore.dataset.audio.MuLawDecoding
mindspore.dataset.audio.MuLawEncoding
mindspore.dataset.audio.Overdrive

View File

@ -245,6 +245,7 @@ Transforms
mindspore.dataset.audio.MaskAlongAxis
mindspore.dataset.audio.MaskAlongAxisIID
mindspore.dataset.audio.MelScale
mindspore.dataset.audio.MFCC
mindspore.dataset.audio.MuLawDecoding
mindspore.dataset.audio.MuLawEncoding
mindspore.dataset.audio.Overdrive

View File

@ -48,6 +48,7 @@
#include "minddata/dataset/audio/ir/kernels/mask_along_axis_iid_ir.h"
#include "minddata/dataset/audio/ir/kernels/mask_along_axis_ir.h"
#include "minddata/dataset/audio/ir/kernels/mel_scale_ir.h"
#include "minddata/dataset/audio/ir/kernels/mfcc_ir.h"
#include "minddata/dataset/audio/ir/kernels/mu_law_decoding_ir.h"
#include "minddata/dataset/audio/ir/kernels/mu_law_encoding_ir.h"
#include "minddata/dataset/audio/ir/kernels/overdrive_ir.h"
@ -710,6 +711,71 @@ Status MelscaleFbanks(MSTensor *output, int32_t n_freqs, float f_min, float f_ma
return Status::OK();
}
// MFCC Transform Operation.
// Parameter bundle owned by the MFCC transform. The values are stored verbatim by the constructor and are only
// read back in MFCC::Parse(), which forwards them to MFCCOperation.
struct MFCC::Data {
  Data(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm, bool log_mels, int32_t n_fft,
       int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad, int32_t n_mels, WindowType window,
       float power, bool normalized, bool center, BorderType pad_mode, bool onesided, NormType norm_mel,
       MelType mel_scale)
      : sample_rate_(sample_rate),
        n_mfcc_(n_mfcc),
        dct_type_(dct_type),
        norm_(norm),
        log_mels_(log_mels),
        n_fft_(n_fft),
        win_length_(win_length),
        hop_length_(hop_length),
        f_min_(f_min),
        f_max_(f_max),
        pad_(pad),
        n_mels_(n_mels),
        window_(window),
        power_(power),
        normalized_(normalized),
        center_(center),
        pad_mode_(pad_mode),
        onesided_(onesided),
        norm_mel_(norm_mel),
        mel_scale_(mel_scale) {}
  // MFCC-level settings.
  int32_t sample_rate_;
  int32_t n_mfcc_;
  int32_t dct_type_;
  NormMode norm_;
  bool log_mels_;
  // Mel spectrogram settings.
  int32_t n_fft_;
  int32_t win_length_;
  int32_t hop_length_;
  float f_min_;
  float f_max_;
  int32_t pad_;
  int32_t n_mels_;
  WindowType window_;
  float power_;
  bool normalized_;
  bool center_;
  BorderType pad_mode_;
  bool onesided_;
  NormType norm_mel_;
  MelType mel_scale_;
  // The former `melkwargs_` map was dropped: no constructor initialized it and nothing read it.
};
MFCC::MFCC(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm, bool log_mels, int32_t n_fft,
int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad, int32_t n_mels,
WindowType window, float power, bool normalized, bool center, BorderType pad_mode, bool onesided,
NormType norm_mel, MelType mel_scale)
: data_(std::make_shared<Data>(sample_rate, n_mfcc, dct_type, norm, log_mels, n_fft, win_length, hop_length, f_min,
f_max, pad, n_mels, window, power, normalized, center, pad_mode, onesided, norm_mel,
mel_scale)) {}
std::shared_ptr<TensorOperation> MFCC::Parse() {
return std::make_shared<MFCCOperation>(data_->sample_rate_, data_->n_mfcc_, data_->dct_type_, data_->norm_,
data_->log_mels_, data_->n_fft_, data_->win_length_, data_->hop_length_,
data_->f_min_, data_->f_max_, data_->pad_, data_->n_mels_, data_->window_,
data_->power_, data_->normalized_, data_->center_, data_->pad_mode_,
data_->onesided_, data_->norm_mel_, data_->mel_scale_);
}
// MuLawDecoding Transform Operation.
struct MuLawDecoding::Data {
explicit Data(int32_t quantization_channels) : quantization_channels_(quantization_channels) {}

View File

@ -52,6 +52,7 @@
#include "minddata/dataset/audio/ir/kernels/mask_along_axis_iid_ir.h"
#include "minddata/dataset/audio/ir/kernels/mask_along_axis_ir.h"
#include "minddata/dataset/audio/ir/kernels/mel_scale_ir.h"
#include "minddata/dataset/audio/ir/kernels/mfcc_ir.h"
#include "minddata/dataset/audio/ir/kernels/mu_law_decoding_ir.h"
#include "minddata/dataset/audio/ir/kernels/mu_law_encoding_ir.h"
#include "minddata/dataset/audio/ir/kernels/overdrive_ir.h"
@ -479,6 +480,31 @@ PYBIND_REGISTER(MelScaleOperation, 1, ([](const py::module *m) {
}));
}));
// Python binding for audio::MFCCOperation.
// The factory takes the MFCC-level arguments directly, the scalar mel spectrogram settings through the
// `melkwargs` dict, and the enum-valued settings (window, pad_mode, norm_mel, mel_scale) as separate arguments.
PYBIND_REGISTER(MFCCOperation, 1, ([](const py::module *m) {
(void)py::class_<audio::MFCCOperation, TensorOperation, std::shared_ptr<audio::MFCCOperation>>(
*m, "MFCCOperation")
.def(py::init([](int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm,
bool log_mels, const py::dict &melkwargs, WindowType window, BorderType pad_mode,
NormType norm_mel, MelType mel_scale) {
// Every key below is looked up unconditionally, so the caller has to provide all of them
// (presumably the Python layer fills in the defaults first - TODO confirm).
int32_t n_fft = py::cast<int>(melkwargs["n_fft"]);
int32_t win_length = py::cast<int>(melkwargs["win_length"]);
int32_t hop_length = py::cast<int>(melkwargs["hop_length"]);
float f_min = py::cast<float>(melkwargs["f_min"]);
float f_max = py::cast<float>(melkwargs["f_max"]);
int32_t pad = py::cast<int>(melkwargs["pad"]);
int32_t n_mels = py::cast<int>(melkwargs["n_mels"]);
float power = py::cast<float>(melkwargs["power"]);
bool normalized = py::cast<bool>(melkwargs["normalized"]);
bool center = py::cast<bool>(melkwargs["center"]);
bool onesided = py::cast<bool>(melkwargs["onesided"]);
auto mfcc = std::make_shared<audio::MFCCOperation>(
sample_rate, n_mfcc, dct_type, norm, log_mels, n_fft, win_length, hop_length, f_min, f_max, pad,
n_mels, window, power, normalized, center, pad_mode, onesided, norm_mel, mel_scale);
// Validate eagerly so that a bad parameter is reported when the operation object is created.
THROW_IF_ERROR(mfcc->ValidateParams());
return mfcc;
}));
}));
PYBIND_REGISTER(
MuLawDecodingOperation, 1, ([](const py::module *m) {
(void)py::class_<audio::MuLawDecodingOperation, TensorOperation, std::shared_ptr<audio::MuLawDecodingOperation>>(

View File

@ -34,6 +34,7 @@ add_library(audio-ir-kernels OBJECT
mask_along_axis_iid_ir.cc
mask_along_axis_ir.cc
mel_scale_ir.cc
mfcc_ir.cc
mu_law_decoding_ir.cc
mu_law_encoding_ir.cc
overdrive_ir.cc

View File

@ -0,0 +1,127 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/audio/ir/kernels/mfcc_ir.h"
#include "minddata/dataset/audio/ir/validators.h"
#include "minddata/dataset/audio/kernels/audio_utils.h"
#include "minddata/dataset/audio/kernels/mfcc_op.h"
namespace mindspore {
namespace dataset {
namespace audio {
MFCCOperation::MFCCOperation(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm, bool log_mels,
int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max,
int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
BorderType pad_mode, bool onesided, NormType norm_mel, MelType mel_scale)
: sample_rate_(sample_rate),
n_mfcc_(n_mfcc),
dct_type_(dct_type),
norm_(norm),
log_mels_(log_mels),
n_fft_(n_fft),
win_length_(win_length),
hop_length_(hop_length),
f_min_(f_min),
f_max_(f_max),
pad_(pad),
n_mels_(n_mels),
window_(window),
power_(power),
normalized_(normalized),
center_(center),
pad_mode_(pad_mode),
onesided_(onesided),
norm_mel_(norm_mel),
mel_scale_(mel_scale) {}
// Defaulted destructor, defined out of line in this translation unit.
MFCCOperation::~MFCCOperation() = default;
// Returns the operation name constant kMFCCOperation.
std::string MFCCOperation::Name() const { return kMFCCOperation; }
// Check the MFCC and mel spectrogram parameters before the runtime op is built.
// Returns Status::OK() when every check passes, otherwise the status of the first failing check.
Status MFCCOperation::ValidateParams() {
  // MFCC params
  RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "sample_rate", sample_rate_));
  RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "n_mfcc", n_mfcc_));
  CHECK_FAIL_RETURN_UNEXPECTED(dct_type_ == TWO,
                               "MFCC: dct_type must be equal to 2, but got: " + std::to_string(dct_type_));
  RETURN_IF_NOT_OK(ValidateFloatScalarNonNegative("MFCC", "f_max", f_max_));
  CHECK_FAIL_RETURN_UNEXPECTED(n_mfcc_ <= n_mels_,
                               "MFCC: n_mels should be greater than or equal to n_mfcc, but got n_mfcc: " +
                                 std::to_string(n_mfcc_) + " and n_mels: " + std::to_string(n_mels_));

  // MelSpectrogram params
  RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "n_mels", n_mels_));
  RETURN_IF_NOT_OK(ValidateIntScalarPositive("MFCC", "n_fft", n_fft_));
  RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "win_length", win_length_));
  RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "hop_length", hop_length_));
  RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "pad", pad_));
  // power_ is a float: validating it through the int helper truncated it first, so legal exponents in (0, 1)
  // such as 0.5 were rejected as "not positive".
  RETURN_IF_NOT_OK(ValidateFloatScalarPositive("MFCC", "power", power_));
  CHECK_FAIL_RETURN_UNEXPECTED(pad_mode_ != BorderType::kEdge, "MFCC: invalid BorderType, kEdge is not supported.");

  // f_max == 0 is the "unset" marker that Build() replaces with sample_rate / 2, so f_min is compared against
  // that bound instead. f_max itself has already been checked to be non-negative above.
  if (f_max_ != 0) {
    CHECK_FAIL_RETURN_UNEXPECTED(f_min_ <= f_max_,
                                 "MFCC: f_max must be greater than or equal to f_min, but got "
                                 "f_max: " +
                                   std::to_string(f_max_) + " and f_min: " + std::to_string(f_min_));
  } else {
    CHECK_FAIL_RETURN_UNEXPECTED(f_min_ < (sample_rate_ * HALF),
                                 "MFCC: f_min must be less than half of sample_rate when f_max is 0, but got"
                                 " f_min: " +
                                   std::to_string(f_min_));
  }
  CHECK_FAIL_RETURN_UNEXPECTED(win_length_ <= n_fft_,
                               "MFCC: win_length must be less than or equal to n_fft, but got win_length: " +
                                 std::to_string(win_length_) + ", n_fft: " + std::to_string(n_fft_));
  return Status::OK();
}
// Resolve the zero-valued "use the default" parameters in place, then create the runtime MFCCOp.
std::shared_ptr<TensorOp> MFCCOperation::Build() {
  // The order matters: hop_length falls back to half of the (already resolved) win_length.
  if (win_length_ == 0) {
    win_length_ = n_fft_;
  }
  if (hop_length_ == 0) {
    hop_length_ = win_length_ / TWO;
  }
  if (f_max_ == 0) {
    f_max_ = sample_rate_ / TWO;
  }
  // Note the argument order expected by MFCCOp: norm_mel_ (mel filter bank) comes before norm_ (DCT).
  return std::make_shared<MFCCOp>(sample_rate_, n_mfcc_, dct_type_, log_mels_, n_fft_, win_length_, hop_length_,
                                  f_min_, f_max_, pad_, n_mels_, window_, power_, normalized_, center_, pad_mode_,
                                  onesided_, norm_mel_, norm_, mel_scale_);
}
// Serialize every parameter of the operation into a JSON object.
// \param[out] out_json Destination JSON object; it is overwritten with the serialized arguments.
// \return Status::OK() on success, an error status if out_json is null.
Status MFCCOperation::to_json(nlohmann::json *out_json) {
  // Guard the output pointer instead of dereferencing it blindly at the end.
  RETURN_UNEXPECTED_IF_NULL(out_json);
  nlohmann::json args;
  args["sample_rate"] = sample_rate_;
  args["n_mfcc"] = n_mfcc_;
  args["dct_type"] = dct_type_;
  args["norm"] = norm_;
  args["log_mels"] = log_mels_;
  args["n_fft"] = n_fft_;
  args["win_length"] = win_length_;
  args["hop_length"] = hop_length_;
  args["f_min"] = f_min_;
  args["f_max"] = f_max_;
  args["pad"] = pad_;
  args["n_mels"] = n_mels_;
  args["window"] = window_;
  args["power"] = power_;
  args["normalized"] = normalized_;
  args["center"] = center_;
  args["pad_mode"] = pad_mode_;
  args["onesided"] = onesided_;
  args["norm_mel"] = norm_mel_;
  args["mel_scale"] = mel_scale_;
  *out_json = args;
  return Status::OK();
}
} // namespace audio
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,96 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_MFCC_IR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_MFCC_IR_H_
#include <map>
#include <memory>
#include <string>
#include "include/api/status.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/kernels/ir/tensor_operation.h"
namespace mindspore {
namespace dataset {
namespace audio {
constexpr char kMFCCOperation[] = "MFCC";
/// \brief IR-level description of the MFCC audio transform: it stores the parameters, validates them and builds
///     the runtime MFCCOp.
class MFCCOperation : public TensorOperation {
public:
/// \brief Constructor.
/// \param[in] sample_rate Sample rate of audio signal.
/// \param[in] n_mfcc Number of mfc coefficients to retain.
/// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
/// \param[in] norm Normalization mode of the DCT matrix.
/// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled.
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
/// \param[in] win_length Window size. 0 is replaced by n_fft in Build().
/// \param[in] hop_length Length of hop between STFT windows. 0 is replaced by win_length / 2 in Build().
/// \param[in] f_min Minimum frequency.
/// \param[in] f_max Maximum frequency. 0 is replaced by sample_rate / 2 in Build().
/// \param[in] pad Two sided padding of signal.
/// \param[in] n_mels Number of mel filterbanks.
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
/// \param[in] normalized Whether to normalize by magnitude after stft.
/// \param[in] center Whether to pad waveform on both sides.
/// \param[in] pad_mode Controls the padding method used when center is True.
/// \param[in] onesided Controls whether to return half of results to avoid redundancy.
/// \param[in] norm_mel If 'slaney', divide the triangular mel weights by the width of the mel band (area
///     normalization).
/// \param[in] mel_scale Scale to use: htk or slaney.
MFCCOperation(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm, bool log_mels, int32_t n_fft,
int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad, int32_t n_mels,
WindowType window, float power, bool normalized, bool center, BorderType pad_mode, bool onesided,
NormType norm_mel, MelType mel_scale);
/// \brief Destructor.
~MFCCOperation();
/// \brief Create the runtime MFCCOp from the stored parameters.
/// \return Shared pointer to the created TensorOp.
std::shared_ptr<TensorOp> Build() override;
/// \brief Check that the stored parameters are valid.
/// \return Status return code.
Status ValidateParams() override;
/// \brief Get the name of the operation.
/// \return The value of kMFCCOperation.
std::string Name() const override;
/// \brief Serialize the stored parameters.
/// \param[out] out_json JSON object receiving the parameters.
/// \return Status return code.
Status to_json(nlohmann::json *out_json) override;
private:
int32_t sample_rate_;
int32_t n_mfcc_;
int32_t dct_type_;
NormMode norm_;  // DCT normalization mode
bool log_mels_;
int32_t n_fft_;
int32_t win_length_;
int32_t hop_length_;
float f_min_;
float f_max_;
int32_t pad_;
int32_t n_mels_;
WindowType window_;
float power_;
bool normalized_;
bool center_;
BorderType pad_mode_;
bool onesided_;
NormType norm_mel_;  // mel filter bank normalization
MelType mel_scale_;
};
} // namespace audio
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_MFCC_IR_H_

View File

@ -35,6 +35,7 @@ add_library(audio-kernels OBJECT
mask_along_axis_iid_op.cc
mask_along_axis_op.cc
mel_scale_op.cc
mfcc_op.cc
mu_law_decoding_op.cc
mu_law_encoding_op.cc
overdrive_op.cc

View File

@ -2230,5 +2230,77 @@ Status LFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *outpu
return Status::OK();
}
// Compute the mel spectrogram of a raw audio signal: Spectrogram() followed by MelScale<float>().
// \param[in] input Waveform tensor whose last dimension is time.
// \param[out] output Receives the mel spectrogram produced by MelScale.
// The remaining parameters are forwarded to Spectrogram / MelScale unchanged.
// \return Status::OK() on success, otherwise the status of the first failing step.
Status MelSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
                      int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad,
                      int32_t n_mels, WindowType window, float power, bool normalized, bool center, BorderType pad_mode,
                      bool onesided, NormType norm, MelType mel_scale) {
  // Null checks have to come first: the shape lookup below dereferences `input`.
  RETURN_UNEXPECTED_IF_NULL(input);
  RETURN_UNEXPECTED_IF_NULL(output);
  auto input_shape_vec = input->shape().AsVector();
  // A tensor without dimensions has no time axis; reject it instead of indexing out of bounds.
  CHECK_FAIL_RETURN_UNEXPECTED(!input_shape_vec.empty(),
                               "MelSpectrogram: input tensor is not in shape of <..., time>.");
  CHECK_FAIL_RETURN_UNEXPECTED(n_fft < TWO * input_shape_vec.back(),
                               "MelSpectrogram: Padding size should be less than the corresponding input dimension.");
  std::shared_ptr<Tensor> spectrogram;
  RETURN_IF_NOT_OK(Spectrogram(input, &spectrogram, pad, window, n_fft, hop_length, win_length, power, normalized,
                               center, pad_mode, onesided));
  // n_fft / 2 + 1 is the number of frequency bins handed to the mel filter bank.
  RETURN_IF_NOT_OK(
    MelScale<float>(spectrogram, output, n_mels, sample_rate, f_min, f_max, n_fft / TWO + 1, norm, mel_scale));
  return Status::OK();
}
// Compute MFCC for a raw audio signal: mel spectrogram -> log / dB scaling -> multiplication with the DCT matrix.
// \param[in] input Waveform tensor.
// \param[out] output Receives a tensor whose last two dimensions are <n_mfcc, time>.
// \param[in] norm Mel filter bank normalization, forwarded to MelSpectrogram.
// \param[in] norm_M DCT normalization mode, forwarded to Dct.
// The remaining parameters are forwarded to MelSpectrogram unchanged; dct_type is not used here.
// \return Status::OK() on success, otherwise the status of the first failing step.
Status MFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, int32_t n_mfcc,
            int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min,
            float f_max, int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
            BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M, MelType mel_scale) {
  RETURN_UNEXPECTED_IF_NULL(input);
  RETURN_UNEXPECTED_IF_NULL(output);
  std::shared_ptr<Tensor> mel_spectrogram;
  std::shared_ptr<Tensor> dct_mat;
  RETURN_IF_NOT_OK(MelSpectrogram(input, &mel_spectrogram, sample_rate, n_fft, win_length, hop_length, f_min, f_max,
                                  pad, n_mels, window, power, normalized, center, pad_mode, onesided, norm, mel_scale));
  RETURN_IF_NOT_OK(Dct(&dct_mat, n_mfcc, n_mels, norm_M));

  if (log_mels) {
    // Natural log with a small offset so that zero-valued bins stay finite.
    const float log_offset = 1e-6;
    for (auto itr = mel_spectrogram->begin<float>(); itr != mel_spectrogram->end<float>(); ++itr) {
      *itr = log(*itr + log_offset);
    }
  } else {
    std::shared_ptr<Tensor> amplitude_to_db;
    float multiplier = 10.0;
    float db_multiplier = 0.0;
    float amin = 1e-10;
    float top_db = 80.0;
    RETURN_IF_NOT_OK(AmplitudeToDB(mel_spectrogram, &amplitude_to_db, multiplier, amin, db_multiplier, top_db));
    mel_spectrogram = amplitude_to_db;
  }

  using FloatMatrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>;
  auto dct_mat_ptr = &*dct_mat->begin<float>();
  Eigen::Map<FloatMatrix> matrix_dm(dct_mat_ptr, n_mfcc, n_mels);

  // Flatten all leading dimensions into one so that every <n_mels, time> plane can be processed in a loop.
  TensorShape st_shape = mel_spectrogram->shape();
  // The trailing two dimensions are used as divisors below, so they must exist and be non-zero.
  CHECK_FAIL_RETURN_UNEXPECTED(st_shape.Size() >= TWO && st_shape[-1] > 0 && st_shape[-2] > 0,
                               "MFCC: mel spectrogram should be in shape of <..., n_mels, time> with non-zero sizes.");
  TensorShape st_reshape({mel_spectrogram->Size() / st_shape[-1] / st_shape[-2], st_shape[-2], st_shape[-1]});
  RETURN_IF_NOT_OK(mel_spectrogram->Reshape(st_reshape));

  const dsize_t kRowIndex = 1;
  const dsize_t kColIndex = 2;
  // Keep the dimensions and the plane offset in 64 bits: the product rows * cols * c overflows `int` for large
  // inputs.
  const dsize_t planes = st_reshape[0];
  const dsize_t rows = st_reshape[kRowIndex];
  const dsize_t cols = st_reshape[kColIndex];
  const dsize_t plane_size = rows * cols;
  float *mel_ptr = &*mel_spectrogram->begin<float>();

  std::vector<float> out_temp;
  out_temp.reserve(static_cast<size_t>(planes * cols * n_mfcc));
  FloatMatrix mat_res;
  for (dsize_t c = 0; c < planes; ++c) {
    // Eigen maps are column-major, so a <cols, rows> map over the row-major <rows, cols> plane is its transpose.
    Eigen::Map<FloatMatrix> matrix_c(mel_ptr + plane_size * c, cols, rows);
    mat_res.noalias() = (matrix_c * matrix_dm.transpose());
    // Append the result directly instead of going through a per-plane temporary vector.
    out_temp.insert(out_temp.end(), mat_res.data(), mat_res.data() + mat_res.size());
  }

  // unpack: restore the leading dimensions, with <n_mfcc, time> as the trailing two.
  std::vector<int64_t> output_shape_vec = st_shape.AsVector();
  output_shape_vec[st_shape.Size() - 1] = cols;
  output_shape_vec[st_shape.Size() - TWO] = n_mfcc;
  TensorShape output_shape(output_shape_vec);
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_temp, output_shape, output));
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -38,6 +38,7 @@ constexpr double PI = 3.141592653589793;
constexpr int kMinAudioDim = 1;
constexpr int kDefaultAudioDim = 2;
constexpr int TWO = 2;
constexpr float HALF = 0.5;
namespace mindspore {
namespace dataset {
@ -2140,6 +2141,60 @@ Status LFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *outpu
int32_t n_filter, int32_t n_lfcc, int32_t dct_type, bool log_lf, int32_t n_fft, int32_t win_length,
int32_t hop_length, float f_min, float f_max, int32_t pad, WindowType window, float power, bool normalized,
bool center, BorderType pad_mode, bool onesided, NormMode norm);
/// \brief Create MelSpectrogram for a raw audio signal.
/// \param[in] input Input tensor.
/// \param[out] output Output tensor.
/// \param[in] sample_rate Sample rate of audio signal.
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
/// \param[in] win_length Window size.
/// \param[in] hop_length Length of hop between STFT windows.
/// \param[in] f_min Minimum frequency, which must be non negative.
/// \param[in] f_max Maximum frequency, which must be positive.
/// \param[in] pad Two sided padding of signal.
/// \param[in] n_mels Number of mel filters, which must be positive.
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
/// \param[in] normalized Whether to normalize by magnitude after stft.
/// \param[in] center Whether to pad waveform on both sides.
/// \param[in] pad_mode Controls the padding method used when center is True.
/// \param[in] onesided Controls whether to return half of results to avoid redundancy.
/// \param[in] norm If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
/// \param[in] mel_scale Scale to use: htk or slaney.
/// \return Status return code.
Status MelSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad,
int32_t n_mels, WindowType window, float power, bool normalized, bool center, BorderType pad_mode,
bool onesided, NormType norm, MelType mel_scale);
/// \brief Create MFCC for a raw audio signal.
/// \param[in] input Input tensor.
/// \param[out] output Output tensor.
/// \param[in] sample_rate Sample rate of audio signal.
/// \param[in] n_mfcc Number of mfc coefficients to retain.
/// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
/// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled.
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
/// \param[in] win_length Window size.
/// \param[in] hop_length Length of hop between STFT windows.
/// \param[in] f_min Minimum frequency.
/// \param[in] f_max Maximum frequency.
/// \param[in] pad Two sided padding of signal.
/// \param[in] n_mels Number of mel filterbanks.
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
/// \param[in] normalized Whether to normalize by magnitude after stft.
/// \param[in] center Whether to pad waveform on both sides.
/// \param[in] pad_mode Controls the padding method used when center is True.
/// \param[in] onesided Controls whether to return half of results to avoid redundancy.
/// \param[in] norm If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
/// \param[in] norm_M Normalization mode of the DCT matrix.
/// \param[in] mel_scale Scale to use: htk or slaney.
/// \return Status return code.
Status MFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, int32_t n_mfcc,
int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min,
float f_max, int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M, MelType mel_scale);
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_

View File

@ -0,0 +1,57 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/audio/kernels/mfcc_op.h"
#include "minddata/dataset/audio/kernels/audio_utils.h"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/kernels/data/data_utils.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
// Run the MFCC kernel on one input tensor using the parameters stored in this op.
Status MFCCOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  RETURN_IF_NOT_OK(MFCC(input, output, sample_rate_, n_mfcc_, dct_type_, log_mels_,  // MFCC settings
                        n_fft_, win_length_, hop_length_, f_min_, f_max_, pad_, n_mels_, window_, power_, normalized_,
                        center_, pad_mode_, onesided_,  // mel spectrogram settings
                        norm_, norm_M_, mel_scale_));
  return Status::OK();
}
// Derive the output shape from the input shape <..., time>: n_mfcc_ is inserted in front of the last dimension,
// giving <..., n_mfcc, time>.
// NOTE(review): the last dimension is copied from the input as-is; confirm whether it should instead reflect
// the number of frames produced by Compute().
// \return Status::OK() on success, an error status if the input shape has no dimensions.
Status MFCCOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) {
  RETURN_IF_NOT_OK(TensorOp::OutputShape(inputs, outputs));
  outputs.clear();
  auto output_shape_vector = inputs[0].AsVector();
  // Without at least one dimension there is no time axis to read (and nothing to pop below).
  if (output_shape_vector.empty()) {
    return Status(StatusCode::kMDUnexpectedError, "MFCC: input tensor is not in shape of <..., time>.");
  }
  // The last element is at size() - 1; indexing with size() read one past the end.
  const auto time = output_shape_vector.back();
  output_shape_vector.pop_back();
  output_shape_vector.push_back(n_mfcc_);
  output_shape_vector.push_back(time);
  outputs.emplace_back(TensorShape(output_shape_vector));
  return Status::OK();
}
// Report the output data type: the input has to be numeric and the result is always float32.
Status MFCCOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
  RETURN_IF_NOT_OK(TensorOp::OutputType(inputs, outputs));
  const DataType &input_type = inputs[0];
  const bool is_numeric = input_type.IsNumeric();
  RETURN_IF_NOT_OK(ValidateTensorType("MFCC", is_numeric, "[float]", input_type.ToString()));
  outputs[0] = DataType(DataType::DE_FLOAT32);
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,113 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_MFCC_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_MFCC_OP_H_
#include <memory>
#include <string>
#include <vector>
#include "include/dataset/constants.h"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/kernels/tensor_op.h"
namespace mindspore {
namespace dataset {
/// \brief Runtime tensor op that computes MFCC for an audio signal with a fixed set of parameters.
class MFCCOp : public TensorOp {
public:
/// \brief Constructor.
/// \param[in] sample_rate Sample rate of audio signal.
/// \param[in] n_mfcc Number of mfc coefficients to retain.
/// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
/// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled.
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
/// \param[in] win_length Window size.
/// \param[in] hop_length Length of hop between STFT windows.
/// \param[in] f_min Minimum frequency.
/// \param[in] f_max Maximum frequency.
/// \param[in] pad Two sided padding of signal.
/// \param[in] n_mels Number of mel filterbanks.
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
/// \param[in] normalized Whether to normalize by magnitude after stft.
/// \param[in] center Whether to pad waveform on both sides.
/// \param[in] pad_mode Controls the padding method used when center is True.
/// \param[in] onesided Controls whether to return half of results to avoid redundancy.
/// \param[in] norm If 'slaney', divide the triangular mel weights by the width of the mel band (area
///     normalization).
/// \param[in] norm_M Normalization mode of the DCT matrix.
/// \param[in] mel_scale Scale to use: htk or slaney.
MFCCOp(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length,
int32_t hop_length, float f_min, float f_max, int32_t pad, int32_t n_mels, WindowType window, float power,
bool normalized, bool center, BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M,
MelType mel_scale)
: sample_rate_(sample_rate),
n_mfcc_(n_mfcc),
dct_type_(dct_type),
log_mels_(log_mels),
n_fft_(n_fft),
win_length_(win_length),
hop_length_(hop_length),
f_min_(f_min),
f_max_(f_max),
pad_(pad),
n_mels_(n_mels),
window_(window),
power_(power),
normalized_(normalized),
center_(center),
pad_mode_(pad_mode),
onesided_(onesided),
norm_(norm),
norm_M_(norm_M),
mel_scale_(mel_scale) {}
/// \brief Destructor.
~MFCCOp() override = default;
/// \brief Compute MFCC of the input tensor.
/// \param[in] input Input tensor.
/// \param[out] output Output tensor.
/// \return Status return code.
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
/// \brief Get the name of the op.
/// \return The value of kMFCCOp.
std::string Name() const override { return kMFCCOp; }
/// \brief Derive the output shapes from the input shapes.
/// \return Status return code.
Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
/// \brief Derive the output data types from the input data types.
/// \return Status return code.
Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
private:
int32_t sample_rate_;
int32_t n_mfcc_;
int32_t dct_type_;
bool log_mels_;
int32_t n_fft_;
int32_t win_length_;
int32_t hop_length_;
float f_min_;
float f_max_;
int32_t pad_;
int32_t n_mels_;
WindowType window_;
float power_;
bool normalized_;
bool center_;
BorderType pad_mode_;
bool onesided_;
NormType norm_;    // mel filter bank normalization
NormMode norm_M_;  // DCT normalization mode
MelType mel_scale_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_MFCC_OP_H_

View File

@ -867,6 +867,52 @@ Status DATASET_API MelscaleFbanks(MSTensor *output, int32_t n_freqs, float f_min
int32_t sample_rate, NormType norm = NormType::kNone,
MelType mel_type = MelType::kHtk);
/// \brief Create MFCC for a raw audio signal.
class DATASET_API MFCC final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] sample_rate Sample rate of audio signal. Default: 16000.
/// \param[in] n_mfcc Number of mfc coefficients to retain. Default: 40.
/// \param[in] dct_type Type of DCT (discrete cosine transform) to use. Default: 2.
/// \param[in] norm Normalization mode of the DCT matrix. Default: NormMode::kOrtho.
/// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled. Default: false.
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins. Default: 400.
/// \param[in] win_length Window size. Default: 0, which means n_fft is used.
/// \param[in] hop_length Length of hop between STFT windows. Default: 0, which means win_length / 2 is used.
/// \param[in] f_min Minimum frequency. Default: 0.
/// \param[in] f_max Maximum frequency. Default: 0, which means sample_rate / 2 is used.
/// \param[in] pad Two sided padding of signal. Default: 0.
/// \param[in] n_mels Number of mel filterbanks. Default: 128.
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
/// Default: WindowType::kHann.
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
/// Default: 2.0.
/// \param[in] normalized Whether to normalize by magnitude after stft. Default: false.
/// \param[in] center Whether to pad waveform on both sides. Default: true.
/// \param[in] pad_mode Controls the padding method used when center is True. Default: BorderType::kReflect.
/// \param[in] onesided Controls whether to return half of results to avoid redundancy. Default: true.
/// \param[in] norm_mel If 'slaney', divide the triangular mel weights by the width of the mel band (area
/// normalization). Default: NormType::kNone.
/// \param[in] mel_scale Scale to use: htk or slaney. Default: MelType::kHtk.
explicit MFCC(int32_t sample_rate = 16000, int32_t n_mfcc = 40, int32_t dct_type = 2,
NormMode norm = NormMode::kOrtho, bool log_mels = false, int32_t n_fft = 400, int32_t win_length = 0,
int32_t hop_length = 0, float f_min = 0, float f_max = 0, int32_t pad = 0, int32_t n_mels = 128,
WindowType window = WindowType::kHann, float power = 2.0, bool normalized = false, bool center = true,
BorderType pad_mode = BorderType::kReflect, bool onesided = true, NormType norm_mel = NormType::kNone,
MelType mel_scale = MelType::kHtk);
/// \brief Destructor.
~MFCC() override = default;
protected:
/// \brief Function to convert TensorTransform object into a TensorOperation object.
/// \return Shared pointer to TensorOperation object.
std::shared_ptr<TensorOperation> Parse() override;
private:
// Opaque parameter bundle; its definition lives in the implementation file.
struct Data;
std::shared_ptr<Data> data_;
};
/// \brief MuLawDecoding TensorTransform.
/// \note Decode mu-law encoded signal.
class DATASET_API MuLawDecoding final : public TensorTransform {

View File

@ -192,6 +192,7 @@ constexpr char kMagphaseOp[] = "MagphaseOp";
// Op name string constants; each value is its identifier without the leading 'k'.
constexpr char kMaskAlongAxisIIDOp[] = "MaskAlongAxisIIDOp";
constexpr char kMaskAlongAxisOp[] = "MaskAlongAxisOp";
constexpr char kMelScaleOp[] = "MelScaleOp";
constexpr char kMFCCOp[] = "MFCCOp";
constexpr char kMuLawDecodingOp[] = "MuLawDecodingOp";
constexpr char kMuLawEncodingOp[] = "MuLawEncodingOp";
constexpr char kOverdriveOp[] = "OverdriveOp";

View File

@ -68,7 +68,7 @@ from mindspore.dataset.audio.transforms import AllpassBiquad, AmplitudeToDB, Ang
BandpassBiquad, BandrejectBiquad, BassBiquad, Biquad, \
ComplexNorm, ComputeDeltas, Contrast, DBToAmplitude, DCShift, DeemphBiquad, DetectPitchFrequency, Dither, \
EqualizerBiquad, Fade, Flanger, FrequencyMasking, Gain, GriffinLim, HighpassBiquad, InverseMelScale, LFCC, \
LFilter, LowpassBiquad, Magphase, MaskAlongAxis, MaskAlongAxisIID, MelScale, MuLawDecoding, MuLawEncoding, \
LFilter, LowpassBiquad, Magphase, MaskAlongAxis, MaskAlongAxisIID, MelScale, MFCC, MuLawDecoding, MuLawEncoding, \
Overdrive, Phaser, PhaseVocoder, Resample, RiaaBiquad, SlidingWindowCmn, SpectralCentroid, Spectrogram, \
TimeMasking, TimeStretch, TrebleBiquad, Vad, Vol
from mindspore.dataset.audio.utils import BorderType, DensityFunction, FadeShape, GainType, Interpolation, \

View File

@ -29,9 +29,10 @@ from .validators import check_allpass_biquad, check_amplitude_to_db, check_band_
check_contrast, check_db_to_amplitude, check_dc_shift, check_deemph_biquad, check_detect_pitch_frequency, \
check_dither, check_equalizer_biquad, check_fade, check_flanger, check_gain, check_griffin_lim, \
check_highpass_biquad, check_inverse_mel_scale, check_lfcc, check_lfilter, check_lowpass_biquad, check_magphase, \
check_mask_along_axis, check_mask_along_axis_iid, check_masking, check_mel_scale, check_mu_law_coding, \
check_overdrive, check_phase_vocoder, check_phaser, check_resample, check_riaa_biquad, check_sliding_window_cmn, \
check_spectral_centroid, check_spectrogram, check_time_stretch, check_treble_biquad, check_vad, check_vol
check_mask_along_axis, check_mask_along_axis_iid, check_masking, check_mel_scale, check_mfcc, \
check_mu_law_coding, check_overdrive, check_phase_vocoder, check_phaser, check_resample, check_riaa_biquad, \
check_sliding_window_cmn, check_spectral_centroid, check_spectrogram, check_time_stretch, check_treble_biquad, \
check_vad, check_vol
from ..transforms.py_transforms_util import Implementation
from ..transforms.transforms import TensorOperation
@ -1516,6 +1517,83 @@ class MelScale(AudioTensorOperation):
DE_C_NORM_TYPE.get(self.norm), DE_C_MEL_TYPE.get(self.mel_type))
class MFCC(AudioTensorOperation):
    """
    Create MFCC for a raw audio signal.

    Args:
        sample_rate (int, optional): Sampling rate of audio signal (in Hz), can't be less than 0. Default: 16000.
        n_mfcc (int, optional): Number of mfc coefficients to retain, can't be less than 0. Default: 40.
        dct_type (int, optional): Type of DCT (discrete cosine transform) to use, can only be 2. Default: 2.
        norm (NormMode, optional): Norm to use. Default: NormMode.ORTHO.
        log_mels (bool, optional): Whether to use log-mel spectrograms instead of db-scaled. Default: False.
        melkwargs (dict, optional): Arguments for the mel spectrogram. Keys that are omitted (or, for
            `win_length`, `hop_length` and `f_max`, set to None) fall back to their defaults. The dict
            passed in is copied and never modified. Default: None, will be set to
            `{'n_fft': 400, 'win_length': n_fft, 'hop_length': win_length // 2, 'f_min' : 0.0,
            'f_max' : sample_rate // 2, 'pad': 0, 'n_mels': 128, 'window': WindowType.HANN, 'power': 2.0,
            'normalized': False, 'center': True, 'pad_mode': BorderType.REFLECT, 'onesided': True,
            'norm' : NormType.NONE, 'mel_scale' : MelType.HTK}` .

    Raises:
        TypeError: If `sample_rate` is not of type int.
        TypeError: If `log_mels` is not of type bool.
        TypeError: If `norm` is not of type :class:`mindspore.dataset.audio.utils.NormMode` .
        TypeError: If `n_mfcc` is not of type int.
        TypeError: If `melkwargs` is not of type dict.
        ValueError: If `sample_rate` is a negative number.
        ValueError: If `n_mfcc` is a negative number.
        ValueError: If `dct_type` is not 2.

    Supported Platforms:
        ``CPU``

    Examples:
        >>> import numpy as np
        >>>
        >>> waveform = np.array([[0.8236, 0.2049, 0.3335], [0.5933, 0.9911, 0.2482],
        ...                      [0.3007, 0.9054, 0.7598], [0.5394, 0.2842, 0.5634], [0.6363, 0.2226, 0.2288]])
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
        >>> transforms = [audio.MFCC(4000, 128, 2)]
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
    """

    @check_mfcc
    def __init__(self, sample_rate=16000, n_mfcc=40, dct_type=2, norm=NormMode.ORTHO, log_mels=False, melkwargs=None):
        super().__init__()
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.dct_type = dct_type
        self.norm = norm
        self.log_mels = log_mels

        # Work on a private copy so that filling in defaults never mutates the caller's dict.
        self.melkwargs = dict(melkwargs) if melkwargs is not None else {}

        # check_mfcc accepts None for these keys; treat None as "not provided" so the derived
        # defaults below apply instead of failing on e.g. `None // 2`.
        for key in ("win_length", "hop_length", "f_max"):
            if self.melkwargs.get(key) is None:
                self.melkwargs.pop(key, None)

        # Order matters: win_length defaults to n_fft, hop_length defaults to win_length // 2.
        self.melkwargs.setdefault("n_fft", 400)
        self.melkwargs.setdefault("win_length", self.melkwargs["n_fft"])
        self.melkwargs.setdefault("hop_length", self.melkwargs["win_length"] // 2)
        self.melkwargs.setdefault("f_min", 0.0)
        self.melkwargs.setdefault("f_max", sample_rate // 2)
        self.melkwargs.setdefault("pad", 0)
        self.melkwargs.setdefault("n_mels", 128)
        self.melkwargs.setdefault("window", WindowType.HANN)
        self.melkwargs.setdefault("power", 2.0)
        self.melkwargs.setdefault("normalized", False)
        self.melkwargs.setdefault("center", True)
        self.melkwargs.setdefault("pad_mode", BorderType.REFLECT)
        self.melkwargs.setdefault("onesided", True)
        self.melkwargs.setdefault("norm", NormType.NONE)
        self.melkwargs.setdefault("mel_scale", MelType.HTK)

        # Enum-valued entries are kept as attributes because parse() maps them through the DE_C_* tables.
        self.window = self.melkwargs["window"]
        self.pad_mode = self.melkwargs["pad_mode"]
        self.norm_mel = self.melkwargs["norm"]
        self.mel_scale = self.melkwargs["mel_scale"]

    def parse(self):
        """Build the C++ MFCCOperation from the stored arguments."""
        return cde.MFCCOperation(self.sample_rate, self.n_mfcc, self.dct_type, DE_C_NORM_MODE.get(self.norm),
                                 self.log_mels, self.melkwargs, DE_C_WINDOW_TYPE.get(self.window),
                                 DE_C_BORDER_TYPE.get(self.pad_mode), DE_C_NORM_TYPE.get(self.norm_mel),
                                 DE_C_MEL_TYPE.get(self.mel_scale))
class MuLawDecoding(AudioTensorOperation):
"""
Decode mu-law encoded signal, refer to `mu-law algorithm <https://en.wikipedia.org/wiki/M-law_algorithm>`_ .

View File

@ -1006,3 +1006,61 @@ def check_lfcc(method):
return method(self, *args, **kwargs)
return new_method
def check_mfcc(method):
    """
    Wrapper method to check the parameters of MFCC.

    `melkwargs` may be None or a dict holding any subset of the supported keys. Numeric and bool
    entries that are missing are validated with the same defaults MFCC falls back to; the
    enum-valued entries (`window`, `pad_mode`, `norm`, `mel_scale`) are validated only when given.

    Raises (from the wrapped call):
        TypeError: If an argument or a `melkwargs` entry has the wrong type.
        ValueError: If a value is out of range, `dct_type` is not 2, `win_length` exceeds `n_fft`,
            or `n_mels` is smaller than `n_mfcc`.
    """

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [sample_rate, n_mfcc, dct_type, norm, log_mels, melkwargs], _ = parse_user_args(method, *args, **kwargs)
        check_non_negative_int32(sample_rate, "sample_rate")
        type_check(log_mels, (bool,), "log_mels")
        type_check(norm, (NormMode,), "norm")
        check_non_negative_int32(n_mfcc, "n_mfcc")
        if dct_type != 2:
            raise ValueError("Input dct_type must be 2, but got : {0}.".format(dct_type))

        # Default number of mel filterbanks used by MFCC when melkwargs does not provide one.
        n_mels = 128

        if melkwargs is not None:
            type_check(melkwargs, (dict,), "melkwargs")
            # Use .get() so a partial dict is validated instead of raising KeyError.
            n_fft = melkwargs.get("n_fft", 400)
            win_length = melkwargs.get("win_length")
            hop_length = melkwargs.get("hop_length")
            f_min = melkwargs.get("f_min", 0.0)
            f_max = melkwargs.get("f_max", sample_rate // 2)
            pad = melkwargs.get("pad", 0)
            power = melkwargs.get("power", 2.0)
            normalized = melkwargs.get("normalized", False)
            center = melkwargs.get("center", True)
            onesided = melkwargs.get("onesided", True)
            n_mels = melkwargs.get("n_mels", n_mels)

            check_pos_int32(n_fft, "n_fft")
            check_mel_scale_n_mels(n_mels)
            check_mel_scale_freq(f_min, f_max, sample_rate)
            if "norm" in melkwargs:
                check_mel_scale_norm(melkwargs["norm"])
            if "mel_scale" in melkwargs:
                check_mel_scale_mel_type(melkwargs["mel_scale"])
            check_power(power)
            if "window" in melkwargs:
                type_check(melkwargs["window"], (WindowType,), "window")
            type_check(normalized, (bool,), "normalized")
            type_check(center, (bool,), "center")
            if "pad_mode" in melkwargs:
                type_check(melkwargs["pad_mode"], (BorderType,), "pad_mode")
            type_check(onesided, (bool,), "onesided")
            check_non_negative_int32(pad, "pad")
            if hop_length is not None:
                check_pos_int32(hop_length, "hop_length")
            if f_max is not None:
                check_non_negative_float32(f_max, "f_max")
            if win_length is not None:
                check_non_negative_int32(win_length, "win_length")
                # A window longer than the FFT size cannot be applied to an FFT frame.
                if win_length > n_fft:
                    raise ValueError("Input win_length should be no more than n_fft, but got win_length: {0} and "
                                     "n_fft: {1}.".format(win_length, n_fft))

        # The DCT keeps n_mfcc coefficients out of n_mels mel bins, so this must hold for the
        # default n_mels as well as for a user-supplied one.
        if n_mels < n_mfcc:
            raise ValueError("Input n_mels should be greater than or equal to n_mfcc, but got n_mfcc: {0} and " \
                             "n_mels: {1}.".format(n_mfcc, n_mels))
        return method(self, *args, **kwargs)

    return new_method

View File

@ -3148,3 +3148,56 @@ TEST_F(MindDataTestPipeline, TestLFCCWrongArgs) {
std::shared_ptr<Iterator> iter = ds->CreateIterator();
EXPECT_EQ(iter, nullptr);
}
/// Feature: MFCC op
/// Description: Test pipeline for MFCC op
/// Expectation: Generate expected output after cases were executed
TEST_F(MindDataTestPipeline, TestMFCCPipeline) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMFCCPipeline.";
  // Original waveform
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("waveform", mindspore::DataType::kNumberTypeFloat32, {1, 1, 300}));
  std::shared_ptr<Dataset> ds = RandomData(10, schema);
  EXPECT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(4);
  EXPECT_NE(ds, nullptr);

  auto mfcc_op1 = audio::MFCC(16000, 40, 2, NormMode::kOrtho, true);

  ds = ds->Map({mfcc_op1});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Check the iterator itself (not the dataset) and stop here if it is null, since it is
  // dereferenced right below.
  ASSERT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<int64_t> expected = {1, 1, 40, 2};

  // Every row must carry a float32 tensor with the expected shape; an empty row marks the end.
  int i = 0;
  while (row.size() != 0) {
    auto col = row["waveform"];
    ASSERT_EQ(col.Shape(), expected);
    ASSERT_EQ(col.Shape().size(), 4);
    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  EXPECT_EQ(i, 10);

  iter->Stop();
}
/// Feature: MFCC op
/// Description: Test wrong arguments for MFCC op
/// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestMFCCWrongArgs) {
  // Log the name of this test (the message previously named another test).
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMFCCWrongArgs.";
  // MFCC: negative sample_rate.
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("waveform", mindspore::DataType::kNumberTypeFloat32, {1, 1, 300}));
  std::shared_ptr<Dataset> ds = RandomData(10, schema);
  EXPECT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(4);
  EXPECT_NE(ds, nullptr);

  auto mfcc_op0 = audio::MFCC(-1);

  // Map() only records the op; the invalid argument is expected to surface at CreateIterator().
  ds = ds->Map({mfcc_op0});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_EQ(iter, nullptr);
}

View File

@ -3053,3 +3053,22 @@ TEST_F(MindDataTestExecute, TestTruncateOpStr) {
Status status = trans(input_ms, &input_ms);
EXPECT_TRUE(status.IsOk());
}
/// Feature: MFCC op
/// Description: Run MFCC once in eager mode on a small in-memory waveform
/// Expectation: The data is processed successfully
TEST_F(MindDataTestExecute, TestMFCCEager) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestMFCC.";

  // 30-sample waveform, shaped as {1, 1, 30}.
  const std::vector<float> samples = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 3, 3, 2,
                                      2, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5};
  std::shared_ptr<Tensor> waveform;
  ASSERT_OK(Tensor::CreateFromVector(samples, TensorShape({1, 1, 30}), &waveform));
  auto waveform_ms = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(waveform));

  // MFCC with 4 coefficients, log-mel output and n_fft = 10.
  std::shared_ptr<TensorTransform> mfcc =
    std::make_shared<audio::MFCC>(16000, 4, 2, NormMode::kOrtho, true, 10);

  // Apply the op in place on the tensor.
  mindspore::dataset::Execute executor({mfcc});
  Status rc = executor(waveform_ms, &waveform_ms);
  EXPECT_TRUE(rc.IsOk());
}

View File

@ -0,0 +1,281 @@
# Copyright 2022 Huawei Technologies Co., Ltd :
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing MFCC Python API
"""
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.audio as audio
from mindspore import log as logger
from mindspore.dataset.audio.utils import WindowType, BorderType, MelType, NormType, NormMode
def count_unequal_element(data_expected, data_me, rtol, atol):
    """
    Assert that two arrays agree within tolerance.

    An element counts as a mismatch when its absolute error exceeds `atol + |expected| * rtol`.
    The check passes while the fraction of mismatching elements stays below `rtol`.
    """
    assert data_expected.shape == data_me.shape
    error = np.abs(data_expected - data_me)
    tolerance = atol + np.abs(data_expected) * rtol
    greater = error > tolerance
    mismatch_ratio = np.count_nonzero(greater) / data_expected.size
    assert mismatch_ratio < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
        data_expected[greater], data_me[greater], error[greater])
def test_mfcc_pipeline():
    """
    Feature: MFCC op in pipeline mode
    Description: Map MFCC over a one-row dataset holding a short audio signal
    Expectation: The output matches the reference coefficients within tolerance
    """
    logger.info("test_mfcc_pipeline")

    wav = [[[1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]]]
    dataset = ds.NumpySlicesDataset(wav, column_names=["audio"], shuffle=False)

    mel_args = {
        "n_fft": 16, "win_length": 16, "hop_length": 8,
        "f_min": 0.0, "f_max": 10000.0,
        "pad": 0, "n_mels": 5,
        "window": WindowType.HANN, "power": 2.0, "normalized": False,
        "center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
        "norm": NormType.NONE, "mel_scale": MelType.HTK,
    }
    mfcc_op = audio.MFCC(sample_rate=16000, n_mfcc=4, dct_type=2, norm=NormMode.ORTHO, log_mels=True,
                         melkwargs=mel_args)
    dataset = dataset.map(operations=mfcc_op, input_columns=["audio"], output_columns=["MFCC"])

    # Reference values for the 4 coefficients over the 4 frames.
    result = np.array([[[2.7625, 5.6919, 3.6229, 3.9756],
                        [0.8142, 3.2698, 1.4946, 3.0683],
                        [-1.6855, -0.8312, -1.1395, 0.0481],
                        [-2.1808, -2.5489, -2.3110, -3.1485]]])
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        count_unequal_element(row["MFCC"], result, 0.0001, 0.0001)
def test_mfcc_eager():
    """
    Feature: MFCC op in eager mode
    Description: Call MFCC directly on a short audio signal
    Expectation: The output matches the reference coefficients within tolerance
    """
    logger.info("test_mfcc_eager")

    wav = np.array([[[1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]]])

    mel_args = {
        "n_fft": 16, "win_length": 16, "hop_length": 8,
        "f_min": 0.0, "f_max": 10000.0,
        "pad": 0, "n_mels": 5,
        "window": WindowType.HANN, "power": 2.0, "normalized": False,
        "center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
        "norm": NormType.NONE, "mel_scale": MelType.HTK,
    }
    mfcc_op = audio.MFCC(sample_rate=16000, n_mfcc=4, dct_type=2, norm=NormMode.ORTHO, log_mels=True,
                         melkwargs=mel_args)
    out = mfcc_op(wav)

    # Same reference values as the pipeline test, with the extra leading dimension of the input.
    result = np.array([[[[2.7625, 5.6919, 3.6229, 3.9756],
                         [0.8142, 3.2698, 1.4946, 3.0683],
                         [-1.6855, -0.8312, -1.1395, 0.0481],
                         [-2.1808, -2.5489, -2.3110, -3.1485]]]])
    count_unequal_element(out, result, 0.0001, 0.0001)
def test_mfcc_param():
    """
    Feature: MFCC op parameter validation
    Description: Construct MFCC with a series of invalid arguments
    Expectation: When the expected ValueError or TypeError is raised, its message contains the expected text
    """
    # Valid mel-spectrogram arguments; every melkwargs case overrides one or two entries of this dict.
    base_mel_args = {
        "n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
        "pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
        "center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
        "norm": NormType.NONE, "mel_scale": MelType.HTK,
    }

    def mel(**overrides):
        """Return MFCC constructor kwargs with a fresh melkwargs dict built from the base plus overrides."""
        return {"melkwargs": dict(base_mel_args, **overrides)}

    # (constructor kwargs, exception type that is caught, text expected inside its message)
    cases = [
        ({"sample_rate": -1}, ValueError,
         "Input sample_rate is not within the required interval of [0, 2147483647]."),
        ({"log_mels": -1}, TypeError,
         "Argument log_mels with value -1 is not of type [<class 'bool'>], but got <class 'int'>."),
        ({"norm": "Karl Marx"}, TypeError,
         "Argument norm with value Karl Marx is not of type [<enum 'NormMode'>], but got <class 'str'>."),
        ({"dct_type": -1}, ValueError,
         "dct_type must be 2, but got : -1."),
        ({"sample_rate": -1}, ValueError,
         "Input sample_rate is not within the required interval of [0, 2147483647]."),
        ({"sample_rate": "s"}, TypeError,
         "Argument sample_rate with value s is not of type [<class 'int'>], but got <class 'str'>."),
        (mel(f_max=-1), ValueError,
         "Input f_max is not within the required interval of (0, 16777216]."),
        (mel(f_min=-1), ValueError,
         "Input n_mels should be greater than or equal to n_mfcc, but got n_mfcc: 40 and n_mels: 5."),
        (mel(norm=-1), TypeError,
         "Argument norm with value -1 is not of type [<enum 'NormType'>], but got <class 'int'>."),
        (mel(mel_scale=-1), TypeError,
         "Argument mel_type with value -1 is not of type [<enum 'MelType'>], but got <class 'int'>."),
        (mel(n_fft=-1), ValueError,
         "Input n_fft is not within the required interval of [1, 2147483647]."),
        (mel(n_fft=0), ValueError,
         "Input n_fft is not within the required interval of [1, 2147483647]."),
        (mel(win_length=0, n_mels=50), ValueError,
         "Input win_length is not within the required interval of [0, 2147483647]."),
        (mel(win_length="s"), TypeError,
         "Argument win_length with value s is not of type [<class 'int'>], but got <class 'str'>."),
        (mel(hop_length=-1), ValueError,
         "Input hop_length is not within the required interval of [1, 2147483647]."),
        (mel(n_fft=200, win_length=300, n_mels=50), ValueError,
         "Input win_length should be no more than n_fft, but got win_length: 300 and n_fft: 200."),
        (mel(pad=-1), ValueError,
         "Input pad is not within the required interval of [0, 2147483647]."),
        (mel(power=-1), ValueError,
         "Input power is not within the required interval of [0, 16777216]."),
        (mel(n_fft="XiaDanni"), TypeError,
         "Argument n_fft with value XiaDanni is not of type [<class 'int'>], but got <class 'str'>."),
        (mel(window=False), TypeError,
         "Argument window with value False is not of type [<enum 'WindowType'>], but got <class 'bool'>."),
        (mel(pad_mode=False), TypeError,
         "Argument pad_mode with value False is not of type [<enum 'BorderType'>], but got <class 'bool'>."),
        (mel(onesided="LianLinghang"), TypeError,
         "Argument onesided with value LianLinghang is not of type [<class 'bool'>], but got <class 'str'>."),
        (mel(center="XiaDanni", onesided=False), TypeError,
         "Argument center with value XiaDanni is not of type [<class 'bool'>], but got <class 'str'>."),
        (mel(normalized="s", onesided=False), TypeError,
         "Argument normalized with value s is not of type [<class 'bool'>], but got <class 'str'>."),
        (mel(normalized=1, onesided="LianLinghang"), TypeError,
         "Argument normalized with value 1 is not of type [<class 'bool'>], but got <class 'int'>."),
    ]

    for ctor_kwargs, expected_exc, expected_msg in cases:
        # Only the listed exception type is caught; a case that raises nothing is not treated as a failure.
        try:
            _ = audio.MFCC(**ctor_kwargs)
        except expected_exc as error:
            logger.info("Got an exception in MFCC: {}".format(str(error)))
            assert expected_msg in str(error)
# Run the three MFCC tests in order when this file is executed as a script.
if __name__ == '__main__':
test_mfcc_pipeline()
test_mfcc_eager()
test_mfcc_param()