forked from mindspore-Ecosystem/mindspore
[assistant][ops][I5EWI6] Add new data operator MFCC.
This commit is contained in:
parent
d870c9090c
commit
ef952d9977
|
@ -0,0 +1,27 @@
|
|||
mindspore.dataset.audio.MFCC
|
||||
============================
|
||||
|
||||
.. py:class:: mindspore.dataset.audio.MFCC(sample_rate=16000, n_mfcc=40, dct_type=2, norm=NormMode.ORTHO, log_mels=False, melkwargs=None)
|
||||
|
||||
计算音频信号的梅尔频率倒谱系数。
|
||||
|
||||
参数:
|
||||
- **sample_rate** (int, 可选) - 采样频率(单位:Hz),不能小于零。默认值:16000。
|
||||
- **n_mfcc** (int, 可选) - 要保留的梅尔频率倒谱系数数,不能小于零。默认:40。
|
||||
- **dct_type** (int, 可选) - 要使用的离散余弦变换类型(离散余弦变换),只能为2。默认:2。
|
||||
- **norm** (NormMode, 可选) - 要使用的标准类型。默认:NormMode.ORTHO。
|
||||
- **log_mels** (bool, 可选) - 是否使用梅尔对数谱图而不是分贝刻度。默认:False。
|
||||
- **melkwargs** (dict, 可选) - 梅尔频谱的参数,如果为None则使用默认参数。默认:None,会被设置为
|
||||
`{'n_fft': 400, 'win_length': n_fft, 'hop_length': win_length // 2, 'f_min' : 0.0, 'f_max' : sample_rate // 2,
|
||||
'pad': 0, 'window': WindowType.HANN, 'power': 2.0, 'normalized': False, 'center': True, 'pad_mode': BorderType.REFLECT,
|
||||
'onesided': True, 'norm' : NormType.NONE, 'mel_scale' : MelType.HTK}` 。
|
||||
|
||||
异常:
|
||||
- **TypeError** - 如果 `sample_rate` 的类型不为int。
|
||||
- **TypeError** - 如果 `log_mels` 的类型不为bool。
|
||||
- **TypeError** - 如果 `norm` 的类型不为 :class:`mindspore.dataset.audio.utils.NormMode` 。
|
||||
- **TypeError** - 如果 `n_mfcc` 的类型不为int。
|
||||
- **TypeError** - 如果 `melkwargs` 的类型不为dict。
|
||||
- **ValueError** - 如果 `sample_rate` 为负数。
|
||||
- **ValueError** - 如果 `n_mfcc` 为负数。
|
||||
- **ValueError** - 如果 `dct_type` 不为2。
|
|
@ -388,6 +388,7 @@ API样例中常用的导入模块如下:
|
|||
mindspore.dataset.audio.MaskAlongAxis
|
||||
mindspore.dataset.audio.MaskAlongAxisIID
|
||||
mindspore.dataset.audio.MelScale
|
||||
mindspore.dataset.audio.MFCC
|
||||
mindspore.dataset.audio.MuLawDecoding
|
||||
mindspore.dataset.audio.MuLawEncoding
|
||||
mindspore.dataset.audio.Overdrive
|
||||
|
|
|
@ -245,6 +245,7 @@ Transforms
|
|||
mindspore.dataset.audio.MaskAlongAxis
|
||||
mindspore.dataset.audio.MaskAlongAxisIID
|
||||
mindspore.dataset.audio.MelScale
|
||||
mindspore.dataset.audio.MFCC
|
||||
mindspore.dataset.audio.MuLawDecoding
|
||||
mindspore.dataset.audio.MuLawEncoding
|
||||
mindspore.dataset.audio.Overdrive
|
||||
|
|
|
@ -48,6 +48,7 @@
|
|||
#include "minddata/dataset/audio/ir/kernels/mask_along_axis_iid_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mask_along_axis_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mel_scale_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mfcc_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mu_law_decoding_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mu_law_encoding_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/overdrive_ir.h"
|
||||
|
@ -710,6 +711,71 @@ Status MelscaleFbanks(MSTensor *output, int32_t n_freqs, float f_min, float f_ma
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// MFCC Transform Operation.
|
||||
struct MFCC::Data {
|
||||
Data(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm, bool log_mels, int32_t n_fft,
|
||||
int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad, int32_t n_mels, WindowType window,
|
||||
float power, bool normalized, bool center, BorderType pad_mode, bool onesided, NormType norm_mel,
|
||||
MelType mel_scale)
|
||||
: sample_rate_(sample_rate),
|
||||
n_mfcc_(n_mfcc),
|
||||
dct_type_(dct_type),
|
||||
norm_(norm),
|
||||
log_mels_(log_mels),
|
||||
n_fft_(n_fft),
|
||||
win_length_(win_length),
|
||||
hop_length_(hop_length),
|
||||
f_min_(f_min),
|
||||
f_max_(f_max),
|
||||
pad_(pad),
|
||||
n_mels_(n_mels),
|
||||
window_(window),
|
||||
power_(power),
|
||||
normalized_(normalized),
|
||||
center_(center),
|
||||
pad_mode_(pad_mode),
|
||||
onesided_(onesided),
|
||||
norm_mel_(norm_mel),
|
||||
mel_scale_(mel_scale) {}
|
||||
int32_t sample_rate_;
|
||||
int32_t n_mfcc_;
|
||||
int32_t dct_type_;
|
||||
NormMode norm_;
|
||||
bool log_mels_;
|
||||
int32_t n_fft_;
|
||||
int32_t win_length_;
|
||||
int32_t hop_length_;
|
||||
float f_min_;
|
||||
float f_max_;
|
||||
int32_t pad_;
|
||||
int32_t n_mels_;
|
||||
WindowType window_;
|
||||
float power_;
|
||||
bool normalized_;
|
||||
bool center_;
|
||||
BorderType pad_mode_;
|
||||
bool onesided_;
|
||||
NormType norm_mel_;
|
||||
MelType mel_scale_;
|
||||
std::map<std::string, std::string> melkwargs_;
|
||||
};
|
||||
|
||||
MFCC::MFCC(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm, bool log_mels, int32_t n_fft,
|
||||
int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad, int32_t n_mels,
|
||||
WindowType window, float power, bool normalized, bool center, BorderType pad_mode, bool onesided,
|
||||
NormType norm_mel, MelType mel_scale)
|
||||
: data_(std::make_shared<Data>(sample_rate, n_mfcc, dct_type, norm, log_mels, n_fft, win_length, hop_length, f_min,
|
||||
f_max, pad, n_mels, window, power, normalized, center, pad_mode, onesided, norm_mel,
|
||||
mel_scale)) {}
|
||||
|
||||
std::shared_ptr<TensorOperation> MFCC::Parse() {
|
||||
return std::make_shared<MFCCOperation>(data_->sample_rate_, data_->n_mfcc_, data_->dct_type_, data_->norm_,
|
||||
data_->log_mels_, data_->n_fft_, data_->win_length_, data_->hop_length_,
|
||||
data_->f_min_, data_->f_max_, data_->pad_, data_->n_mels_, data_->window_,
|
||||
data_->power_, data_->normalized_, data_->center_, data_->pad_mode_,
|
||||
data_->onesided_, data_->norm_mel_, data_->mel_scale_);
|
||||
}
|
||||
|
||||
// MuLawDecoding Transform Operation.
|
||||
struct MuLawDecoding::Data {
|
||||
explicit Data(int32_t quantization_channels) : quantization_channels_(quantization_channels) {}
|
||||
|
|
|
@ -52,6 +52,7 @@
|
|||
#include "minddata/dataset/audio/ir/kernels/mask_along_axis_iid_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mask_along_axis_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mel_scale_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mfcc_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mu_law_decoding_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/mu_law_encoding_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/overdrive_ir.h"
|
||||
|
@ -479,6 +480,31 @@ PYBIND_REGISTER(MelScaleOperation, 1, ([](const py::module *m) {
|
|||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(MFCCOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<audio::MFCCOperation, TensorOperation, std::shared_ptr<audio::MFCCOperation>>(
|
||||
*m, "MFCCOperation")
|
||||
.def(py::init([](int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm,
|
||||
bool log_mels, const py::dict &melkwargs, WindowType window, BorderType pad_mode,
|
||||
NormType norm_mel, MelType mel_scale) {
|
||||
int32_t n_fft = py::cast<int>(melkwargs["n_fft"]);
|
||||
int32_t win_length = py::cast<int>(melkwargs["win_length"]);
|
||||
int32_t hop_length = py::cast<int>(melkwargs["hop_length"]);
|
||||
float f_min = py::cast<float>(melkwargs["f_min"]);
|
||||
float f_max = py::cast<float>(melkwargs["f_max"]);
|
||||
int32_t pad = py::cast<int>(melkwargs["pad"]);
|
||||
int32_t n_mels = py::cast<int>(melkwargs["n_mels"]);
|
||||
float power = py::cast<float>(melkwargs["power"]);
|
||||
bool normalized = py::cast<bool>(melkwargs["normalized"]);
|
||||
bool center = py::cast<bool>(melkwargs["center"]);
|
||||
bool onesided = py::cast<bool>(melkwargs["onesided"]);
|
||||
auto mfcc = std::make_shared<audio::MFCCOperation>(
|
||||
sample_rate, n_mfcc, dct_type, norm, log_mels, n_fft, win_length, hop_length, f_min, f_max, pad,
|
||||
n_mels, window, power, normalized, center, pad_mode, onesided, norm_mel, mel_scale);
|
||||
THROW_IF_ERROR(mfcc->ValidateParams());
|
||||
return mfcc;
|
||||
}));
|
||||
}));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
MuLawDecodingOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<audio::MuLawDecodingOperation, TensorOperation, std::shared_ptr<audio::MuLawDecodingOperation>>(
|
||||
|
|
|
@ -34,6 +34,7 @@ add_library(audio-ir-kernels OBJECT
|
|||
mask_along_axis_iid_ir.cc
|
||||
mask_along_axis_ir.cc
|
||||
mel_scale_ir.cc
|
||||
mfcc_ir.cc
|
||||
mu_law_decoding_ir.cc
|
||||
mu_law_encoding_ir.cc
|
||||
overdrive_ir.cc
|
||||
|
|
|
@ -0,0 +1,127 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/audio/ir/kernels/mfcc_ir.h"
|
||||
|
||||
#include "minddata/dataset/audio/ir/validators.h"
|
||||
#include "minddata/dataset/audio/kernels/audio_utils.h"
|
||||
#include "minddata/dataset/audio/kernels/mfcc_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
namespace audio {
|
||||
MFCCOperation::MFCCOperation(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm, bool log_mels,
|
||||
int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max,
|
||||
int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
|
||||
BorderType pad_mode, bool onesided, NormType norm_mel, MelType mel_scale)
|
||||
: sample_rate_(sample_rate),
|
||||
n_mfcc_(n_mfcc),
|
||||
dct_type_(dct_type),
|
||||
norm_(norm),
|
||||
log_mels_(log_mels),
|
||||
n_fft_(n_fft),
|
||||
win_length_(win_length),
|
||||
hop_length_(hop_length),
|
||||
f_min_(f_min),
|
||||
f_max_(f_max),
|
||||
pad_(pad),
|
||||
n_mels_(n_mels),
|
||||
window_(window),
|
||||
power_(power),
|
||||
normalized_(normalized),
|
||||
center_(center),
|
||||
pad_mode_(pad_mode),
|
||||
onesided_(onesided),
|
||||
norm_mel_(norm_mel),
|
||||
mel_scale_(mel_scale) {}
|
||||
|
||||
MFCCOperation::~MFCCOperation() = default;
|
||||
|
||||
std::string MFCCOperation::Name() const { return kMFCCOperation; }
|
||||
|
||||
Status MFCCOperation::ValidateParams() {
|
||||
RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "sample_rate", sample_rate_));
|
||||
RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "n_mfcc", n_mfcc_));
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(dct_type_ == TWO,
|
||||
"MFCC: dct_type must be equal to 2, but got: " + std::to_string(dct_type_));
|
||||
RETURN_IF_NOT_OK(ValidateFloatScalarNonNegative("MFCC", "f_max", f_max_));
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(n_mfcc_ <= n_mels_,
|
||||
"MFCC: n_mels should be greater than or equal to n_mfcc, but got n_mfcc: " +
|
||||
std::to_string(n_mfcc_) + " and n_mels: " + std::to_string(n_mels_));
|
||||
// MelSpectrogram params
|
||||
RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "n_mels", n_mels_));
|
||||
RETURN_IF_NOT_OK(ValidateIntScalarPositive("MFCC", "n_fft", n_fft_));
|
||||
RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "win_length", win_length_));
|
||||
RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "hop_length", hop_length_));
|
||||
RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "pad", pad_));
|
||||
RETURN_IF_NOT_OK(ValidateIntScalarPositive("MFCC", "power", power_));
|
||||
RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("MFCC", "n_mels", n_mels_));
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(pad_mode_ != BorderType::kEdge, "MFCC: invalid BorderType, kEdge is not supported.");
|
||||
if (f_max_ != 0) {
|
||||
RETURN_IF_NOT_OK(ValidateFloatScalarNonNegative("MFCC", "f_max", f_max_));
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(f_min_ <= f_max_,
|
||||
"MFCC: f_max must be greater than or equal to f_min, but got "
|
||||
"f_max: " +
|
||||
std::to_string(f_max_) + " and f_min: " + std::to_string(f_min_));
|
||||
} else {
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(f_min_ < (sample_rate_ * HALF),
|
||||
"MFCC: f_min must be less than half of sample_rate when f_max is 0, but got"
|
||||
" f_min: " +
|
||||
std::to_string(f_min_));
|
||||
}
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(win_length_ <= n_fft_,
|
||||
"MFCC: win_length must be less than or equal to n_fft, but got win_length: " +
|
||||
std::to_string(win_length_) + ", n_fft: " + std::to_string(n_fft_));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::shared_ptr<TensorOp> MFCCOperation::Build() {
|
||||
win_length_ = win_length_ == 0 ? n_fft_ : win_length_;
|
||||
hop_length_ = hop_length_ == 0 ? (win_length_ / TWO) : hop_length_;
|
||||
f_max_ = f_max_ == 0 ? (sample_rate_ / TWO) : f_max_;
|
||||
std::shared_ptr<MFCCOp> tensor_op = std::make_shared<MFCCOp>(
|
||||
sample_rate_, n_mfcc_, dct_type_, log_mels_, n_fft_, win_length_, hop_length_, f_min_, f_max_, pad_, n_mels_,
|
||||
window_, power_, normalized_, center_, pad_mode_, onesided_, norm_mel_, norm_, mel_scale_);
|
||||
return tensor_op;
|
||||
}
|
||||
|
||||
Status MFCCOperation::to_json(nlohmann::json *out_json) {
|
||||
nlohmann::json args;
|
||||
args["sample_rate"] = sample_rate_;
|
||||
args["n_mfcc"] = n_mfcc_;
|
||||
args["dct_type"] = dct_type_;
|
||||
args["norm"] = norm_;
|
||||
args["log_mels"] = log_mels_;
|
||||
args["n_fft"] = n_fft_;
|
||||
args["win_length"] = win_length_;
|
||||
args["hop_length"] = hop_length_;
|
||||
args["f_min"] = f_min_;
|
||||
args["f_max"] = f_max_;
|
||||
args["pad"] = pad_;
|
||||
args["n_mels"] = n_mels_;
|
||||
args["window"] = window_;
|
||||
args["power"] = power_;
|
||||
args["normalized"] = normalized_;
|
||||
args["center"] = center_;
|
||||
args["pad_mode"] = pad_mode_;
|
||||
args["onesided"] = onesided_;
|
||||
args["norm_mel"] = norm_mel_;
|
||||
args["mel_scale"] = mel_scale_;
|
||||
*out_json = args;
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace audio
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,96 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_MFCC_IR_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_MFCC_IR_H_
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "include/api/status.h"
|
||||
#include "minddata/dataset/include/dataset/constants.h"
|
||||
#include "minddata/dataset/kernels/ir/tensor_operation.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
namespace audio {
|
||||
constexpr char kMFCCOperation[] = "MFCC";
|
||||
|
||||
class MFCCOperation : public TensorOperation {
|
||||
public:
|
||||
/// \brief Constructor.
|
||||
/// \param[in] sample_rate Sample rate of audio signal.
|
||||
/// \param[in] n_mfcc Number of mfc coefficients to retain.
|
||||
/// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
|
||||
/// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled.
|
||||
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
|
||||
/// \param[in] win_length Window size.
|
||||
/// \param[in] hop_length Length of hop between STFT windows.
|
||||
/// \param[in] f_min Minimum frequency.
|
||||
/// \param[in] f_max Maximum frequency.
|
||||
/// \param[in] pad Two sided padding of signal.
|
||||
/// \param[in] n_mels Number of mel filterbanks.
|
||||
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
|
||||
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
|
||||
/// \param[in] normalized Whether to normalize by magnitude after stft.
|
||||
/// \param[in] center Whether to pad waveform on both sides.
|
||||
/// \param[in] pad_mode Controls the padding method used when center is True.
|
||||
/// \param[in] onesided Controls whether to return half of results to avoid redundancy.
|
||||
/// \param[in] norm_mel Norm to use.
|
||||
/// \param[in] norm If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
|
||||
/// \param[in] mel_scale Scale to use: htk or slaney.
|
||||
MFCCOperation(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, NormMode norm, bool log_mels, int32_t n_fft,
|
||||
int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad, int32_t n_mels,
|
||||
WindowType window, float power, bool normalized, bool center, BorderType pad_mode, bool onesided,
|
||||
NormType norm_mel, MelType mel_scale);
|
||||
|
||||
~MFCCOperation();
|
||||
|
||||
std::shared_ptr<TensorOp> Build() override;
|
||||
|
||||
Status ValidateParams() override;
|
||||
|
||||
std::string Name() const override;
|
||||
|
||||
Status to_json(nlohmann::json *out_json) override;
|
||||
|
||||
private:
|
||||
int32_t sample_rate_;
|
||||
int32_t n_mfcc_;
|
||||
int32_t dct_type_;
|
||||
NormMode norm_;
|
||||
bool log_mels_;
|
||||
int32_t n_fft_;
|
||||
int32_t win_length_;
|
||||
int32_t hop_length_;
|
||||
float f_min_;
|
||||
float f_max_;
|
||||
int32_t pad_;
|
||||
int32_t n_mels_;
|
||||
WindowType window_;
|
||||
float power_;
|
||||
bool normalized_;
|
||||
bool center_;
|
||||
BorderType pad_mode_;
|
||||
bool onesided_;
|
||||
NormType norm_mel_;
|
||||
MelType mel_scale_;
|
||||
};
|
||||
} // namespace audio
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_MFCC_IR_H_
|
|
@ -35,6 +35,7 @@ add_library(audio-kernels OBJECT
|
|||
mask_along_axis_iid_op.cc
|
||||
mask_along_axis_op.cc
|
||||
mel_scale_op.cc
|
||||
mfcc_op.cc
|
||||
mu_law_decoding_op.cc
|
||||
mu_law_encoding_op.cc
|
||||
overdrive_op.cc
|
||||
|
|
|
@ -2230,5 +2230,77 @@ Status LFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *outpu
|
|||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status MelSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
|
||||
int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad,
|
||||
int32_t n_mels, WindowType window, float power, bool normalized, bool center, BorderType pad_mode,
|
||||
bool onesided, NormType norm, MelType mel_scale) {
|
||||
auto input_shape_vec = input->shape().AsVector();
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(n_fft < TWO * input_shape_vec[input_shape_vec.size() - 1],
|
||||
"MelSpectrogram: Padding size should be less than the corresponding input dimension.");
|
||||
RETURN_UNEXPECTED_IF_NULL(input);
|
||||
RETURN_UNEXPECTED_IF_NULL(output);
|
||||
std::shared_ptr<Tensor> spectrogram;
|
||||
RETURN_IF_NOT_OK(Spectrogram(input, &spectrogram, pad, window, n_fft, hop_length, win_length, power, normalized,
|
||||
center, pad_mode, onesided));
|
||||
RETURN_IF_NOT_OK(
|
||||
MelScale<float>(spectrogram, output, n_mels, sample_rate, f_min, f_max, n_fft / TWO + 1, norm, mel_scale));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status MFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, int32_t n_mfcc,
|
||||
int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min,
|
||||
float f_max, int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
|
||||
BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M, MelType mel_scale) {
|
||||
RETURN_UNEXPECTED_IF_NULL(input);
|
||||
RETURN_UNEXPECTED_IF_NULL(output);
|
||||
std::shared_ptr<Tensor> mel_spectrogram;
|
||||
std::shared_ptr<Tensor> dct_mat;
|
||||
RETURN_IF_NOT_OK(MelSpectrogram(input, &mel_spectrogram, sample_rate, n_fft, win_length, hop_length, f_min, f_max,
|
||||
pad, n_mels, window, power, normalized, center, pad_mode, onesided, norm, mel_scale));
|
||||
RETURN_IF_NOT_OK(Dct(&dct_mat, n_mfcc, n_mels, norm_M));
|
||||
if (log_mels) {
|
||||
for (auto itr = mel_spectrogram->begin<float>(); itr != mel_spectrogram->end<float>(); ++itr) {
|
||||
float log_offset = 1e-6;
|
||||
*itr = log(*itr + log_offset);
|
||||
}
|
||||
} else {
|
||||
std::shared_ptr<Tensor> amplitude_to_db;
|
||||
float multiplier = 10.0;
|
||||
float db_multiplier = 0.0;
|
||||
float amin = 1e-10;
|
||||
float top_db = 80.0;
|
||||
RETURN_IF_NOT_OK(AmplitudeToDB(mel_spectrogram, &litude_to_db, multiplier, amin, db_multiplier, top_db));
|
||||
mel_spectrogram = amplitude_to_db;
|
||||
}
|
||||
auto dct_mat_ptr = &*dct_mat->begin<float>();
|
||||
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> mat_res;
|
||||
Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_dm(dct_mat_ptr, n_mfcc, n_mels);
|
||||
TensorShape st_shape = mel_spectrogram->shape();
|
||||
TensorShape st_reshape({mel_spectrogram->Size() / st_shape[-1] / st_shape[-2], st_shape[-2], st_shape[-1]});
|
||||
RETURN_IF_NOT_OK(mel_spectrogram->Reshape(st_reshape));
|
||||
|
||||
const dsize_t kRowIndex = 1;
|
||||
const dsize_t kColIndex = 2;
|
||||
int rows = st_reshape[kRowIndex];
|
||||
int cols = st_reshape[kColIndex];
|
||||
std::vector<float> out_temp;
|
||||
|
||||
for (int c = 0; c < st_reshape[0]; c++) {
|
||||
Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_c(
|
||||
&*mel_spectrogram->begin<float>() + rows * cols * c, cols, rows);
|
||||
mat_res.noalias() = (matrix_c * matrix_dm.transpose());
|
||||
std::vector<float> vec_c(mat_res.data(), mat_res.data() + mat_res.size());
|
||||
out_temp.insert(out_temp.end(), vec_c.begin(), vec_c.end());
|
||||
}
|
||||
// unpack
|
||||
std::vector<int64_t> output_shape_vec = st_shape.AsVector();
|
||||
output_shape_vec[st_shape.Size() - 1] = cols;
|
||||
output_shape_vec[st_shape.Size() - TWO] = n_mfcc;
|
||||
TensorShape output_shape(output_shape_vec);
|
||||
RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_temp, output_shape, output));
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -38,6 +38,7 @@ constexpr double PI = 3.141592653589793;
|
|||
constexpr int kMinAudioDim = 1;
|
||||
constexpr int kDefaultAudioDim = 2;
|
||||
constexpr int TWO = 2;
|
||||
constexpr float HALF = 0.5;
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
@ -2140,6 +2141,60 @@ Status LFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *outpu
|
|||
int32_t n_filter, int32_t n_lfcc, int32_t dct_type, bool log_lf, int32_t n_fft, int32_t win_length,
|
||||
int32_t hop_length, float f_min, float f_max, int32_t pad, WindowType window, float power, bool normalized,
|
||||
bool center, BorderType pad_mode, bool onesided, NormMode norm);
|
||||
|
||||
/// \brief Create MelSpectrogram for a raw audio signal.
|
||||
/// \param[in] input Input tensor.
|
||||
/// \param[out] output Output tensor.
|
||||
/// \param[in] sample_rate Sample rate of audio signal.
|
||||
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
|
||||
/// \param[in] win_length Window size.
|
||||
/// \param[in] hop_length Length of hop between STFT windows.
|
||||
/// \param[in] f_min Minimum frequency, which must be non negative.
|
||||
/// \param[in] f_max Maximum frequency, which must be positive.
|
||||
/// \param[in] pad Two sided padding of signal.
|
||||
/// \param[in] n_mels Number of mel filter, which must be positive.
|
||||
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
|
||||
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
|
||||
/// \param[in] normalized Whether to normalize by magnitude after stft.
|
||||
/// \param[in] center Whether to pad waveform on both sides.
|
||||
/// \param[in] pad_mode controls the padding method used when center is True.
|
||||
/// \param[in] onesided controls whether to return half of results to avoid redundancy.
|
||||
/// \param[in] norm If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
|
||||
/// \param[in] mel_scale Scale to use: htk or slaney.
|
||||
/// \return Status return code.
|
||||
Status MelSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
|
||||
int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad,
|
||||
int32_t n_mels, WindowType window, float power, bool normalized, bool center, BorderType pad_mode,
|
||||
bool onesided, NormType norm, MelType mel_scale);
|
||||
|
||||
/// \brief Create MFCC for a raw audio signal.
|
||||
/// \param[in] input Input tensor.
|
||||
/// \param[out] output Output tensor.
|
||||
/// \param[in] sample_rate Sample rate of audio signal.
|
||||
/// \param[in] n_mfcc Number of mfc coefficients to retain.
|
||||
/// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
|
||||
/// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled.
|
||||
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
|
||||
/// \param[in] win_length Window size.
|
||||
/// \param[in] hop_length Length of hop between STFT windows.
|
||||
/// \param[in] f_min Minimum frequency.
|
||||
/// \param[in] f_max Maximum frequency.
|
||||
/// \param[in] pad Two sided padding of signal.
|
||||
/// \param[in] n_mels Number of mel filterbanks.
|
||||
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
|
||||
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
|
||||
/// \param[in] normalized Whether to normalize by magnitude after stft.
|
||||
/// \param[in] center Whether to pad waveform on both sides.
|
||||
/// \param[in] pad_mode Controls the padding method used when center is True.
|
||||
/// \param[in] onesided Controls whether to return half of results to avoid redundancy.
|
||||
/// \param[in] norm Norm to use.
|
||||
/// \param[in] norm_M If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
|
||||
/// \param[in] mel_scale Scale to use: htk or slaney.
|
||||
/// \return Status return code.
|
||||
Status MFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, int32_t n_mfcc,
|
||||
int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min,
|
||||
float f_max, int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
|
||||
BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M, MelType mel_scale);
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "minddata/dataset/audio/kernels/mfcc_op.h"
|
||||
|
||||
#include "minddata/dataset/audio/kernels/audio_utils.h"
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/kernels/data/data_utils.h"
|
||||
#include "minddata/dataset/kernels/tensor_op.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
Status MFCCOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
|
||||
IO_CHECK(input, output);
|
||||
return MFCC(input, output, sample_rate_, n_mfcc_, dct_type_, log_mels_, n_fft_, win_length_, hop_length_, f_min_,
|
||||
f_max_, pad_, n_mels_, window_, power_, normalized_, center_, pad_mode_, onesided_, norm_, norm_M_,
|
||||
mel_scale_);
|
||||
}
|
||||
|
||||
Status MFCCOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) {
|
||||
RETURN_IF_NOT_OK(TensorOp::OutputShape(inputs, outputs));
|
||||
outputs.clear();
|
||||
auto output_shape_vector = inputs[0].AsVector();
|
||||
auto time = output_shape_vector[output_shape_vector.size()];
|
||||
output_shape_vector.pop_back();
|
||||
output_shape_vector.push_back(n_mfcc_);
|
||||
output_shape_vector.push_back(time);
|
||||
TensorShape out = TensorShape(output_shape_vector);
|
||||
outputs.emplace_back(out);
|
||||
if (!outputs.empty()) {
|
||||
return Status::OK();
|
||||
}
|
||||
return Status(StatusCode::kMDUnexpectedError, "MFCC: input tensor is not in shape of <..., time>.");
|
||||
}
|
||||
|
||||
Status MFCCOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
|
||||
RETURN_IF_NOT_OK(TensorOp::OutputType(inputs, outputs));
|
||||
RETURN_IF_NOT_OK(ValidateTensorType("MFCC", inputs[0].IsNumeric(), "[float]", inputs[0].ToString()));
|
||||
outputs[0] = DataType(DataType::DE_FLOAT32);
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,113 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_MFCC_OP_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_MFCC_OP_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "include/dataset/constants.h"
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/kernels/tensor_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
class MFCCOp : public TensorOp {
|
||||
public:
|
||||
/// \brief Constructor.
|
||||
/// \param[in] sample_rate Sample rate of audio signal.
|
||||
/// \param[in] n_mfcc Number of mfc coefficients to retain.
|
||||
/// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
|
||||
/// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled.
|
||||
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
|
||||
/// \param[in] win_length Window size.
|
||||
/// \param[in] hop_length Length of hop between STFT windows.
|
||||
/// \param[in] f_min Minimum frequency.
|
||||
/// \param[in] f_max Maximum frequency.
|
||||
/// \param[in] pad Two sided padding of signal.
|
||||
/// \param[in] n_mels Number of mel filterbanks.
|
||||
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
|
||||
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
|
||||
/// \param[in] normalized Whether to normalize by magnitude after stft.
|
||||
/// \param[in] center Whether to pad waveform on both sides.
|
||||
/// \param[in] pad_mode Controls the padding method used when center is True.
|
||||
/// \param[in] onesided Controls whether to return half of results to avoid redundancy.
|
||||
/// \param[in] norm Norm to use.
|
||||
/// \param[in] norm_M If 'slaney', divide the triangular mel weights by the width of the mel band (area
|
||||
/// normalization).
|
||||
/// \param[in] mel_scale Scale to use: htk or slaney.
|
||||
MFCCOp(int32_t sample_rate, int32_t n_mfcc, int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length,
|
||||
int32_t hop_length, float f_min, float f_max, int32_t pad, int32_t n_mels, WindowType window, float power,
|
||||
bool normalized, bool center, BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M,
|
||||
MelType mel_scale)
|
||||
: sample_rate_(sample_rate),
|
||||
n_mfcc_(n_mfcc),
|
||||
dct_type_(dct_type),
|
||||
log_mels_(log_mels),
|
||||
n_fft_(n_fft),
|
||||
win_length_(win_length),
|
||||
hop_length_(hop_length),
|
||||
f_min_(f_min),
|
||||
f_max_(f_max),
|
||||
pad_(pad),
|
||||
n_mels_(n_mels),
|
||||
window_(window),
|
||||
power_(power),
|
||||
normalized_(normalized),
|
||||
center_(center),
|
||||
pad_mode_(pad_mode),
|
||||
onesided_(onesided),
|
||||
norm_(norm),
|
||||
norm_M_(norm_M),
|
||||
mel_scale_(mel_scale) {}
|
||||
|
||||
~MFCCOp() override = default;
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
std::string Name() const override { return kMFCCOp; }
|
||||
|
||||
Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
|
||||
|
||||
Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
|
||||
|
||||
private:
|
||||
int32_t sample_rate_;
|
||||
int32_t n_mfcc_;
|
||||
int32_t dct_type_;
|
||||
bool log_mels_;
|
||||
int32_t n_fft_;
|
||||
int32_t win_length_;
|
||||
int32_t hop_length_;
|
||||
float f_min_;
|
||||
float f_max_;
|
||||
int32_t pad_;
|
||||
int32_t n_mels_;
|
||||
WindowType window_;
|
||||
float power_;
|
||||
bool normalized_;
|
||||
bool center_;
|
||||
BorderType pad_mode_;
|
||||
bool onesided_;
|
||||
NormType norm_;
|
||||
NormMode norm_M_;
|
||||
MelType mel_scale_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_MFCC_OP_H_
|
|
@ -867,6 +867,52 @@ Status DATASET_API MelscaleFbanks(MSTensor *output, int32_t n_freqs, float f_min
|
|||
int32_t sample_rate, NormType norm = NormType::kNone,
|
||||
MelType mel_type = MelType::kHtk);
|
||||
|
||||
/// \brief Create MFCC for a raw audio signal.
|
||||
class DATASET_API MFCC final : public TensorTransform {
|
||||
public:
|
||||
/// \param[in] sample_rate Sample rate of audio signal. Default: 16000.
|
||||
/// \param[in] n_mfcc Number of mfc coefficients to retain. Default: 40.
|
||||
/// \param[in] dct_type Type of DCT (discrete cosine transform) to use. Default: 2.
|
||||
/// \param[in] norm If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
|
||||
/// Default: NormMode::kOrtho.
|
||||
/// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled. Default: false.
|
||||
/// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins. Default: 400.
|
||||
/// \param[in] win_length Window size. Default: 0.
|
||||
/// \param[in] hop_length Length of hop between STFT windows. Default: 0.
|
||||
/// \param[in] f_min Minimum frequency. Default: 0.
|
||||
/// \param[in] f_max Maximum frequency. Default: 0.
|
||||
/// \param[in] pad Two sided padding of signal. Default: 0.
|
||||
/// \param[in] n_mels Number of mel filterbanks. Default: 128.
|
||||
/// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
|
||||
/// Default: WindowType::kHann.
|
||||
/// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
|
||||
/// Default: 2.0.
|
||||
/// \param[in] normalized Whether to normalize by magnitude after stft. Default: false.
|
||||
/// \param[in] center Whether to pad waveform on both sides. Default: true.
|
||||
/// \param[in] pad_mode Controls the padding method used when center is True. Default: BorderType::kReflect.
|
||||
/// \param[in] onesided Controls whether to return half of results to avoid redundancy. Default: true.
|
||||
/// \param[in] norm_mel Norm to use. Default: NormType::kNone.
|
||||
/// \param[in] mel_scale Scale to use: htk or slaney. Default: MelType::kHtk.
|
||||
explicit MFCC(int32_t sample_rate = 16000, int32_t n_mfcc = 40, int32_t dct_type = 2,
|
||||
NormMode norm = NormMode::kOrtho, bool log_mels = false, int32_t n_fft = 400, int32_t win_length = 0,
|
||||
int32_t hop_length = 0, float f_min = 0, float f_max = 0, int32_t pad = 0, int32_t n_mels = 128,
|
||||
WindowType window = WindowType::kHann, float power = 2.0, bool normalized = false, bool center = true,
|
||||
BorderType pad_mode = BorderType::kReflect, bool onesided = true, NormType norm_mel = NormType::kNone,
|
||||
MelType mel_scale = MelType::kHtk);
|
||||
|
||||
/// \brief Destructor.
|
||||
~MFCC() override = default;
|
||||
|
||||
protected:
|
||||
/// \brief Function to convert TensorTransform object into a TensorOperation object.
|
||||
/// \return Shared pointer to TensorOperation object.
|
||||
std::shared_ptr<TensorOperation> Parse() override;
|
||||
|
||||
private:
|
||||
struct Data;
|
||||
std::shared_ptr<Data> data_;
|
||||
};
|
||||
|
||||
/// \brief MuLawDecoding TensorTransform.
|
||||
/// \note Decode mu-law encoded signal.
|
||||
class DATASET_API MuLawDecoding final : public TensorTransform {
|
||||
|
|
|
@ -192,6 +192,7 @@ constexpr char kMagphaseOp[] = "MagphaseOp";
|
|||
constexpr char kMaskAlongAxisIIDOp[] = "MaskAlongAxisIIDOp";
|
||||
constexpr char kMaskAlongAxisOp[] = "MaskAlongAxisOp";
|
||||
constexpr char kMelScaleOp[] = "MelScaleOp";
|
||||
constexpr char kMFCCOp[] = "MFCCOp";
|
||||
constexpr char kMuLawDecodingOp[] = "MuLawDecodingOp";
|
||||
constexpr char kMuLawEncodingOp[] = "MuLawEncodingOp";
|
||||
constexpr char kOverdriveOp[] = "OverdriveOp";
|
||||
|
|
|
@ -68,7 +68,7 @@ from mindspore.dataset.audio.transforms import AllpassBiquad, AmplitudeToDB, Ang
|
|||
BandpassBiquad, BandrejectBiquad, BassBiquad, Biquad, \
|
||||
ComplexNorm, ComputeDeltas, Contrast, DBToAmplitude, DCShift, DeemphBiquad, DetectPitchFrequency, Dither, \
|
||||
EqualizerBiquad, Fade, Flanger, FrequencyMasking, Gain, GriffinLim, HighpassBiquad, InverseMelScale, LFCC, \
|
||||
LFilter, LowpassBiquad, Magphase, MaskAlongAxis, MaskAlongAxisIID, MelScale, MuLawDecoding, MuLawEncoding, \
|
||||
LFilter, LowpassBiquad, Magphase, MaskAlongAxis, MaskAlongAxisIID, MelScale, MFCC, MuLawDecoding, MuLawEncoding, \
|
||||
Overdrive, Phaser, PhaseVocoder, Resample, RiaaBiquad, SlidingWindowCmn, SpectralCentroid, Spectrogram, \
|
||||
TimeMasking, TimeStretch, TrebleBiquad, Vad, Vol
|
||||
from mindspore.dataset.audio.utils import BorderType, DensityFunction, FadeShape, GainType, Interpolation, \
|
||||
|
|
|
@ -29,9 +29,10 @@ from .validators import check_allpass_biquad, check_amplitude_to_db, check_band_
|
|||
check_contrast, check_db_to_amplitude, check_dc_shift, check_deemph_biquad, check_detect_pitch_frequency, \
|
||||
check_dither, check_equalizer_biquad, check_fade, check_flanger, check_gain, check_griffin_lim, \
|
||||
check_highpass_biquad, check_inverse_mel_scale, check_lfcc, check_lfilter, check_lowpass_biquad, check_magphase, \
|
||||
check_mask_along_axis, check_mask_along_axis_iid, check_masking, check_mel_scale, check_mu_law_coding, \
|
||||
check_overdrive, check_phase_vocoder, check_phaser, check_resample, check_riaa_biquad, check_sliding_window_cmn, \
|
||||
check_spectral_centroid, check_spectrogram, check_time_stretch, check_treble_biquad, check_vad, check_vol
|
||||
check_mask_along_axis, check_mask_along_axis_iid, check_masking, check_mel_scale, check_mfcc, \
|
||||
check_mu_law_coding, check_overdrive, check_phase_vocoder, check_phaser, check_resample, check_riaa_biquad, \
|
||||
check_sliding_window_cmn, check_spectral_centroid, check_spectrogram, check_time_stretch, check_treble_biquad, \
|
||||
check_vad, check_vol
|
||||
from ..transforms.py_transforms_util import Implementation
|
||||
from ..transforms.transforms import TensorOperation
|
||||
|
||||
|
@ -1516,6 +1517,83 @@ class MelScale(AudioTensorOperation):
|
|||
DE_C_NORM_TYPE.get(self.norm), DE_C_MEL_TYPE.get(self.mel_type))
|
||||
|
||||
|
||||
class MFCC(AudioTensorOperation):
|
||||
"""
|
||||
Create MFCC for a raw audio signal.
|
||||
|
||||
Args:
|
||||
sample_rate (int, optional): Sampling rate of audio signal (in Hz), can't be less than 0. Default: 16000.
|
||||
n_mfcc (int, optional): Number of mfc coefficients to retain, can't be less than 0. Default: 40.
|
||||
dct_type (int, optional): Type of DCT (discrete cosine transform) to use, can only be 2. Default: 2.
|
||||
norm (NormMode, optional): Norm to use. Default: NormMode.ORTHO.
|
||||
log_mels (bool, optional): Whether to use log-mel spectrograms instead of db-scaled. Default: False.
|
||||
melkwargs (dict, optional): Arguments for Spectrogram. Default: None, will be set to
|
||||
`{'n_fft': 400, 'win_length': n_fft, 'hop_length': win_length // 2, 'f_min' : 0.0,
|
||||
'f_max' : sample_rate // 2, 'pad': 0, 'window': WindowType.HANN, 'power': 2.0, 'normalized': False,
|
||||
'center': True, 'pad_mode': BorderType.REFLECT, 'onesided': True, 'norm' : NormType.NONE,
|
||||
'mel_scale' : MelType.HTK}` .
|
||||
|
||||
Raises:
|
||||
TypeError: If `sample_rate` is not of type int.
|
||||
TypeError: If `log_mels` is not of type bool.
|
||||
TypeError: If `norm` is not of type :class:`mindspore.dataset.audio.utils.NormMode` .
|
||||
TypeError: If `n_mfcc` is not of type int.
|
||||
TypeError: If `melkwargs` is not of type dict.
|
||||
ValueError: If `sample_rate` is a negative number.
|
||||
ValueError: If `n_mfcc` is a negative number.
|
||||
ValueError: If `dct_type` is not 2.
|
||||
|
||||
Supported Platforms:
|
||||
``CPU``
|
||||
|
||||
Examples:
|
||||
>>> import numpy as np
|
||||
>>>
|
||||
>>> waveform = np.array([[0.8236, 0.2049, 0.3335], [0.5933, 0.9911, 0.2482],
|
||||
... [0.3007, 0.9054, 0.7598], [0.5394, 0.2842, 0.5634], [0.6363, 0.2226, 0.2288]])
|
||||
>>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
|
||||
>>> transforms = [audio.MFCC(4000, 1500, 0.7)]
|
||||
>>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
|
||||
"""
|
||||
|
||||
@check_mfcc
|
||||
def __init__(self, sample_rate=16000, n_mfcc=40, dct_type=2, norm=NormMode.ORTHO, log_mels=False, melkwargs=None):
|
||||
super().__init__()
|
||||
self.sample_rate = sample_rate
|
||||
self.n_mfcc = n_mfcc
|
||||
self.dct_type = dct_type
|
||||
self.norm = norm
|
||||
self.log_mels = log_mels
|
||||
self.melkwargs = melkwargs
|
||||
if melkwargs is None:
|
||||
self.melkwargs = {}
|
||||
self.melkwargs.setdefault("n_fft", 400)
|
||||
self.melkwargs.setdefault("win_length", self.melkwargs.get("n_fft"))
|
||||
self.melkwargs.setdefault("hop_length", self.melkwargs.get("win_length") // 2)
|
||||
self.melkwargs.setdefault("f_min", 0.0)
|
||||
self.melkwargs.setdefault("f_max", sample_rate // 2)
|
||||
self.melkwargs.setdefault("pad", 0)
|
||||
self.melkwargs.setdefault("n_mels", 128)
|
||||
self.melkwargs.setdefault("window", WindowType.HANN)
|
||||
self.melkwargs.setdefault("power", 2.0)
|
||||
self.melkwargs.setdefault("normalized", False)
|
||||
self.melkwargs.setdefault("center", True)
|
||||
self.melkwargs.setdefault("pad_mode", BorderType.REFLECT)
|
||||
self.melkwargs.setdefault("onesided", True)
|
||||
self.melkwargs.setdefault("norm", NormType.NONE)
|
||||
self.melkwargs.setdefault("mel_scale", MelType.HTK)
|
||||
self.window = self.melkwargs.get("window")
|
||||
self.pad_mode = self.melkwargs.get("pad_mode")
|
||||
self.norm_mel = self.melkwargs.get("norm")
|
||||
self.mel_scale = self.melkwargs.get("mel_scale")
|
||||
|
||||
def parse(self):
|
||||
return cde.MFCCOperation(self.sample_rate, self.n_mfcc, self.dct_type, DE_C_NORM_MODE.get(self.norm),
|
||||
self.log_mels, self.melkwargs, DE_C_WINDOW_TYPE.get(self.window),
|
||||
DE_C_BORDER_TYPE.get(self.pad_mode), DE_C_NORM_TYPE.get(self.norm_mel),
|
||||
DE_C_MEL_TYPE.get(self.mel_scale))
|
||||
|
||||
|
||||
class MuLawDecoding(AudioTensorOperation):
|
||||
"""
|
||||
Decode mu-law encoded signal, refer to `mu-law algorithm <https://en.wikipedia.org/wiki/M-law_algorithm>`_ .
|
||||
|
|
|
@ -1006,3 +1006,61 @@ def check_lfcc(method):
|
|||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
||||
|
||||
def check_mfcc(method):
|
||||
"""Wrapper method to check the parameters of MFCC."""
|
||||
|
||||
@wraps(method)
|
||||
def new_method(self, *args, **kwargs):
|
||||
[sample_rate, n_mfcc, dct_type, norm, log_mels, melkwargs], _ = parse_user_args(method, *args, **kwargs)
|
||||
check_non_negative_int32(sample_rate, "sample_rate")
|
||||
type_check(log_mels, (bool,), "log_mels")
|
||||
type_check(norm, (NormMode,), "norm")
|
||||
check_non_negative_int32(n_mfcc, "n_mfcc")
|
||||
if dct_type != 2:
|
||||
raise ValueError("Input dct_type must be 2, but got : {0}.".format(dct_type))
|
||||
|
||||
if melkwargs is not None:
|
||||
type_check(melkwargs, (dict,), "melkwargs")
|
||||
n_fft = melkwargs["n_fft"]
|
||||
win_length = melkwargs["win_length"]
|
||||
hop_length = melkwargs["hop_length"]
|
||||
f_min = melkwargs["f_min"]
|
||||
f_max = melkwargs["f_max"]
|
||||
pad = melkwargs["pad"]
|
||||
power = melkwargs["power"]
|
||||
normalized = melkwargs["normalized"]
|
||||
center = melkwargs["center"]
|
||||
onesided = melkwargs["onesided"]
|
||||
window = melkwargs["window"]
|
||||
pad_mode = melkwargs["pad_mode"]
|
||||
norm_mel = melkwargs["norm"]
|
||||
mel_scale = melkwargs["mel_scale"]
|
||||
n_mels = melkwargs["n_mels"]
|
||||
|
||||
check_pos_int32(n_fft, "n_fft")
|
||||
check_mel_scale_n_mels(n_mels)
|
||||
check_mel_scale_freq(f_min, f_max, sample_rate)
|
||||
check_mel_scale_norm(norm_mel)
|
||||
check_mel_scale_mel_type(mel_scale)
|
||||
check_power(power)
|
||||
type_check(window, (WindowType,), "window")
|
||||
type_check(normalized, (bool,), "normalized")
|
||||
type_check(center, (bool,), "center")
|
||||
type_check(pad_mode, (BorderType,), "pad_mode")
|
||||
type_check(onesided, (bool,), "onesided")
|
||||
check_non_negative_int32(pad, "pad")
|
||||
if hop_length is not None:
|
||||
check_pos_int32(hop_length, "hop_length")
|
||||
if f_max is not None:
|
||||
check_non_negative_float32(f_max, "f_max")
|
||||
if win_length is not None:
|
||||
check_non_negative_int32(win_length, "win_length")
|
||||
if n_mels < n_mfcc:
|
||||
raise ValueError("Input n_mels should be greater than or equal to n_mfcc, but got n_mfcc: {0} and " \
|
||||
"n_mels: {1}.".format(n_mfcc, n_mels))
|
||||
|
||||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
|
|
@ -3148,3 +3148,56 @@ TEST_F(MindDataTestPipeline, TestLFCCWrongArgs) {
|
|||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
EXPECT_EQ(iter, nullptr);
|
||||
}
|
||||
|
||||
/// Feature: MFCC op
|
||||
/// Description: Test pipeline for MFCC op
|
||||
/// Expectation: Generate expected output after cases were executed
|
||||
TEST_F(MindDataTestPipeline, TestMFCCPipeline) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMFCCPipeline.";
|
||||
// Original waveform
|
||||
std::shared_ptr<SchemaObj> schema = Schema();
|
||||
ASSERT_OK(schema->add_column("waveform", mindspore::DataType::kNumberTypeFloat32, {1, 1, 300}));
|
||||
std::shared_ptr<Dataset> ds = RandomData(10, schema);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
|
||||
ds = ds->SetNumWorkers(4);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
auto mfcc_op1 = audio::MFCC(16000, 40, 2, NormMode::kOrtho, true);
|
||||
ds = ds->Map({mfcc_op1});
|
||||
EXPECT_NE(ds, nullptr);
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
EXPECT_NE(ds, nullptr);
|
||||
std::unordered_map<std::string, mindspore::MSTensor> row;
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
std::vector<int64_t> expected = {1, 1, 40, 2};
|
||||
int i = 0;
|
||||
while (row.size() != 0) {
|
||||
auto col = row["waveform"];
|
||||
ASSERT_EQ(col.Shape(), expected);
|
||||
ASSERT_EQ(col.Shape().size(), 4);
|
||||
ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
|
||||
ASSERT_OK(iter->GetNextRow(&row));
|
||||
i++;
|
||||
}
|
||||
EXPECT_EQ(i, 10);
|
||||
iter->Stop();
|
||||
}
|
||||
|
||||
/// Feature: MFCC op
|
||||
/// Description: Test wrong arguments for MFCC op
|
||||
/// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
|
||||
TEST_F(MindDataTestPipeline, TestMFCCWrongArgs) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestInverseMelScaleWrongArgs.";
|
||||
// MFCC: negative sample_rate.
|
||||
std::shared_ptr<SchemaObj> schema = Schema();
|
||||
ASSERT_OK(schema->add_column("waveform", mindspore::DataType::kNumberTypeFloat32, {1, 1, 300}));
|
||||
std::shared_ptr<Dataset> ds = RandomData(10, schema);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
ds = ds->SetNumWorkers(4);
|
||||
EXPECT_NE(ds, nullptr);
|
||||
auto mfcc_op0 = audio::MFCC(-1);
|
||||
ds = ds->Map({mfcc_op0});
|
||||
EXPECT_NE(ds, nullptr);
|
||||
std::shared_ptr<Iterator> iter = ds->CreateIterator();
|
||||
EXPECT_EQ(iter, nullptr);
|
||||
}
|
||||
|
|
|
@ -3053,3 +3053,22 @@ TEST_F(MindDataTestExecute, TestTruncateOpStr) {
|
|||
Status status = trans(input_ms, &input_ms);
|
||||
EXPECT_TRUE(status.IsOk());
|
||||
}
|
||||
|
||||
/// Feature: MFCC op
|
||||
/// Description: Test basic usage of MFCC op
|
||||
/// Expectation: The data is processed successfully
|
||||
TEST_F(MindDataTestExecute, TestMFCCEager) {
|
||||
MS_LOG(INFO) << "Doing MindDataTestExecute-TestMFCC.";
|
||||
// Original waveform
|
||||
std::vector<float> labels = {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 3, 3, 2,
|
||||
2, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5};
|
||||
std::shared_ptr<Tensor> input;
|
||||
ASSERT_OK(Tensor::CreateFromVector(labels, TensorShape({1, 1, 30}), &input));
|
||||
auto input_ms = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(input));
|
||||
std::shared_ptr<TensorTransform> mfcc_op =
|
||||
std::make_shared<audio::MFCC>(16000, 4, 2, NormMode::kOrtho, true, 10);
|
||||
// apply MFCC
|
||||
mindspore::dataset::Execute trans({mfcc_op});
|
||||
Status status = trans(input_ms, &input_ms);
|
||||
EXPECT_TRUE(status.IsOk());
|
||||
}
|
||||
|
|
|
@ -0,0 +1,281 @@
|
|||
# Copyright 2022 Huawei Technologies Co., Ltd :
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing MFCC Python API
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.audio as audio
|
||||
from mindspore import log as logger
|
||||
from mindspore.dataset.audio.utils import WindowType, BorderType, MelType, NormType, NormMode
|
||||
|
||||
|
||||
def count_unequal_element(data_expected, data_me, rtol, atol):
|
||||
""" Precision calculation func """
|
||||
assert data_expected.shape == data_me.shape
|
||||
total_count = len(data_expected.flatten())
|
||||
error = np.abs(data_expected - data_me)
|
||||
greater = np.greater(error, atol + np.abs(data_expected) * rtol)
|
||||
loss_count = np.count_nonzero(greater)
|
||||
assert (loss_count / total_count) < rtol, "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}".format(
|
||||
data_expected[greater], data_me[greater], error[greater])
|
||||
|
||||
|
||||
def test_mfcc_pipeline():
|
||||
"""
|
||||
Feature: Mindspore pipeline mode normal testcase: mfcc op
|
||||
Description: Input audio signal to test pipeline
|
||||
Expectation: Generate expected output after cases were executed
|
||||
"""
|
||||
logger.info("test_mfcc_pipeline")
|
||||
|
||||
wav = [[[1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]]]
|
||||
dataset = ds.NumpySlicesDataset(wav, column_names=["audio"], shuffle=False)
|
||||
out = audio.MFCC(sample_rate=16000, n_mfcc=4, dct_type=2, norm=NormMode.ORTHO, log_mels=True,
|
||||
melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0,
|
||||
"f_max": 10000.0, "pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0,
|
||||
"normalized": False, "center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
dataset = dataset.map(operations=out, input_columns=["audio"], output_columns=["MFCC"])
|
||||
result = np.array([[[2.7625, 5.6919, 3.6229, 3.9756],
|
||||
[0.8142, 3.2698, 1.4946, 3.0683],
|
||||
[-1.6855, -0.8312, -1.1395, 0.0481],
|
||||
[-2.1808, -2.5489, -2.3110, -3.1485]]])
|
||||
for data1 in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
|
||||
count_unequal_element(data1["MFCC"], result, 0.0001, 0.0001)
|
||||
|
||||
|
||||
def test_mfcc_eager():
|
||||
"""
|
||||
Feature: Mindspore eager mode normal testcase: mfcc op
|
||||
Description: Input audio signal to test eager
|
||||
Expectation: Generate expected output after cases were executed
|
||||
"""
|
||||
logger.info("test_mfcc_eager")
|
||||
wav = np.array([[[1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]]])
|
||||
out = audio.MFCC(sample_rate=16000, n_mfcc=4, dct_type=2, norm=NormMode.ORTHO, log_mels=True,
|
||||
melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": False,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})(wav)
|
||||
result = np.array([[[[2.7625, 5.6919, 3.6229, 3.9756],
|
||||
[0.8142, 3.2698, 1.4946, 3.0683],
|
||||
[-1.6855, -0.8312, -1.1395, 0.0481],
|
||||
[-2.1808, -2.5489, -2.3110, -3.1485]]]])
|
||||
count_unequal_element(out, result, 0.0001, 0.0001)
|
||||
|
||||
|
||||
def test_mfcc_param():
|
||||
"""
|
||||
Feature: Test mfcc invalid parameter.
|
||||
Description: Test some invalid parameters.
|
||||
Expectation: throw ValueError, TypeError or RuntimeError exception.
|
||||
"""
|
||||
try:
|
||||
_ = audio.MFCC(sample_rate=-1)
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input sample_rate is not within the required interval of [0, 2147483647]." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(log_mels=-1)
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument log_mels with value -1 is not of type [<class 'bool'>], but got <class 'int'>." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(norm="Karl Marx")
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument norm with value Karl Marx is not of type [<enum 'NormMode'>], but got <class 'str'>." \
|
||||
in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(dct_type=-1)
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "dct_type must be 2, but got : -1." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(sample_rate=-1)
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input sample_rate is not within the required interval of [0, 2147483647]." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(sample_rate="s")
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument sample_rate with value s is not of type [<class 'int'>], but got <class 'str'>." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": -1,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input f_max is not within the required interval of (0, 16777216]." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": -1, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input n_mels should be greater than or equal to n_mfcc, but got n_mfcc: 40 and n_mels: 5." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": -1, "mel_scale": MelType.HTK})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument norm with value -1 is not of type [<enum 'NormType'>], but got <class 'int'>." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": -1})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument mel_type with value -1 is not of type [<enum 'MelType'>], but got <class 'int'>." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": -1, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input n_fft is not within the required interval of [1, 2147483647]." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 0, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input n_fft is not within the required interval of [1, 2147483647]." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 0, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 50, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input win_length is not within the required interval of [0, 2147483647]." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": "s", "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument win_length with value s is not of type [<class 'int'>], but got <class 'str'>." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": -1, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input hop_length is not within the required interval of [1, 2147483647]." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 200, "win_length": 300, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 50, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input win_length should be no more than n_fft, but got win_length: 300 and n_fft: 200." \
|
||||
in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": -1, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input pad is not within the required interval of [0, 2147483647]." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": -1, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Input power is not within the required interval of [0, 16777216]." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": "XiaDanni", "win_length": 16, "hop_length": 8, "f_min": 0.0,
|
||||
"f_max": 10000.0, "pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0,
|
||||
"normalized": True, "center": True, "pad_mode": BorderType.REFLECT,
|
||||
"onesided": True, "norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument n_fft with value XiaDanni is not of type [<class 'int'>], but got <class 'str'>." \
|
||||
in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": False, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": True,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument window with value False is not of type [<enum 'WindowType'>], but got <class 'bool'>." \
|
||||
in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": False, "onesided": True, "norm": NormType.NONE,
|
||||
"mel_scale": MelType.HTK})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument pad_mode with value False is not of type [<enum 'BorderType'>], but got <class 'bool'>." \
|
||||
in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": "LianLinghang",
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument onesided with value LianLinghang is not of type [<class 'bool'>], but got <class 'str'>." \
|
||||
in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": True,
|
||||
"center": "XiaDanni", "pad_mode": BorderType.REFLECT, "onesided": False,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument center with value XiaDanni is not of type [<class 'bool'>], but got <class 'str'>." \
|
||||
in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": "s",
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": False,
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument normalized with value s is not of type [<class 'bool'>], but got <class 'str'>." in str(error)
|
||||
try:
|
||||
_ = audio.MFCC(melkwargs={"n_fft": 16, "win_length": 16, "hop_length": 8, "f_min": 0.0, "f_max": 10000.0,
|
||||
"pad": 0, "n_mels": 5, "window": WindowType.HANN, "power": 2.0, "normalized": 1,
|
||||
"center": True, "pad_mode": BorderType.REFLECT, "onesided": "LianLinghang",
|
||||
"norm": NormType.NONE, "mel_scale": MelType.HTK})
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in MFCC: {}".format(str(error)))
|
||||
assert "Argument normalized with value 1 is not of type [<class 'bool'>], but got <class 'int'>." in str(error)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_mfcc_pipeline()
|
||||
test_mfcc_eager()
|
||||
test_mfcc_param()
|
Loading…
Reference in New Issue