!33270 [assistant][InverseMelScale]

Merge pull request !33270 from chenchen/InverseMelScale
This commit is contained in:
i-robot 2022-04-26 08:55:58 +00:00 committed by Gitee
commit fcb0319747
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
27 changed files with 1057 additions and 29 deletions

View File

@ -39,6 +39,7 @@
#include "minddata/dataset/audio/ir/kernels/gain_ir.h"
#include "minddata/dataset/audio/ir/kernels/griffin_lim_ir.h"
#include "minddata/dataset/audio/ir/kernels/highpass_biquad_ir.h"
#include "minddata/dataset/audio/ir/kernels/inverse_mel_scale_ir.h"
#include "minddata/dataset/audio/ir/kernels/lfilter_ir.h"
#include "minddata/dataset/audio/ir/kernels/lowpass_biquad_ir.h"
#include "minddata/dataset/audio/ir/kernels/magphase_ir.h"
@ -464,6 +465,47 @@ std::shared_ptr<TensorOperation> HighpassBiquad::Parse() {
return std::make_shared<HighpassBiquadOperation>(data_->sample_rate_, data_->cutoff_freq_, data_->Q_);
}
// InverseMelScale Transform Operation.
// Plain holder for the arguments given to the InverseMelScale constructor. Parse() reads these fields back and
// forwards them, unchanged, to InverseMelScaleOperation.
struct InverseMelScale::Data {
// Copies every argument into the member of the same name; nothing is validated or normalized here.
Data(int32_t n_stft, int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
float tolerance_loss, float tolerance_change, const std::map<std::string, float> &sgdargs, NormType norm,
MelType mel_type)
: n_stft_(n_stft),
n_mels_(n_mels),
sample_rate_(sample_rate),
f_min_(f_min),
f_max_(f_max),
max_iter_(max_iter),
tolerance_loss_(tolerance_loss),
tolerance_change_(tolerance_change),
sgdargs_(sgdargs),
norm_(norm),
mel_type_(mel_type) {}
// Members are declared in the same order as the initializer list above; keep both in sync.
int32_t n_stft_;
int32_t n_mels_;
int32_t sample_rate_;
float f_min_;
float f_max_;
int32_t max_iter_;
float tolerance_loss_;
float tolerance_change_;
// Optimizer settings keyed by name; stored as an independent copy of the caller's map.
std::map<std::string, float> sgdargs_;
NormType norm_;
MelType mel_type_;
};
// Bundles all constructor arguments into a shared Data object. No argument is checked at this point.
InverseMelScale::InverseMelScale(int32_t n_stft, int32_t n_mels, int32_t sample_rate, float f_min, float f_max,
int32_t max_iter, float tolerance_loss, float tolerance_change,
const std::map<std::string, float> &sgdargs, NormType norm, MelType mel_type)
: data_(std::make_shared<Data>(n_stft, n_mels, sample_rate, f_min, f_max, max_iter, tolerance_loss,
tolerance_change, sgdargs, norm, mel_type)) {}
std::shared_ptr<TensorOperation> InverseMelScale::Parse() {
return std::make_shared<InverseMelScaleOperation>(
data_->n_stft_, data_->n_mels_, data_->sample_rate_, data_->f_min_, data_->f_max_, data_->max_iter_,
data_->tolerance_loss_, data_->tolerance_change_, data_->sgdargs_, data_->norm_, data_->mel_type_);
}
// LFilter Transform Operation.
struct LFilter::Data {
Data(const std::vector<float> &a_coeffs, const std::vector<float> &b_coeffs, bool clamp)

View File

@ -43,6 +43,7 @@
#include "minddata/dataset/audio/ir/kernels/gain_ir.h"
#include "minddata/dataset/audio/ir/kernels/griffin_lim_ir.h"
#include "minddata/dataset/audio/ir/kernels/highpass_biquad_ir.h"
#include "minddata/dataset/audio/ir/kernels/inverse_mel_scale_ir.h"
#include "minddata/dataset/audio/ir/kernels/lfilter_ir.h"
#include "minddata/dataset/audio/ir/kernels/lowpass_biquad_ir.h"
#include "minddata/dataset/audio/ir/kernels/magphase_ir.h"
@ -359,6 +360,20 @@ PYBIND_REGISTER(
}));
}));
PYBIND_REGISTER(InverseMelScaleOperation, 1, ([](const py::module *m) {
                  // Expose the IR node to Python under the name "InverseMelScaleOperation".
                  using IrNode = audio::InverseMelScaleOperation;
                  (void)py::class_<IrNode, TensorOperation, std::shared_ptr<IrNode>>(*m, "InverseMelScaleOperation")
                    .def(py::init([](int32_t n_stft, int32_t n_mels, int32_t sample_rate, float f_min, float f_max,
                                     int32_t max_iter, float tolerance_loss, float tolerance_change,
                                     const py::dict &sgdargs, NormType norm, MelType mel_type) {
                      // The Python dict of optimizer settings is converted to a string -> float map first.
                      auto ir_node = std::make_shared<IrNode>(n_stft, n_mels, sample_rate, f_min, f_max, max_iter,
                                                              tolerance_loss, tolerance_change,
                                                              toStringFloatMap(sgdargs), norm, mel_type);
                      // Parameter errors are raised to the Python caller at construction time.
                      THROW_IF_ERROR(ir_node->ValidateParams());
                      return ir_node;
                    }));
                }));
PYBIND_REGISTER(LFilterOperation, 1, ([](const py::module *m) {
(void)py::class_<audio::LFilterOperation, TensorOperation, std::shared_ptr<audio::LFilterOperation>>(
*m, "LFilterOperation")

View File

@ -49,6 +49,16 @@ std::map<std::string, int32_t> toStringMap(const py::dict dict) {
return map;
}
// Convert a Python dict into a std::map: each key goes through toString() and each value through toFloat().
// An empty dict yields an empty map. If a key occurs twice after conversion, the first inserted entry is kept
// (std::map::emplace does not overwrite).
std::map<std::string, float> toStringFloatMap(const py::dict dict) {
  std::map<std::string, float> converted;
  for (auto entry : dict) {
    (void)converted.emplace(toString(entry.first), toFloat(entry.second));
  }
  return converted;
}
std::vector<std::string> toStringVector(const py::list list) {
std::vector<std::string> vector;
if (!list.empty()) {

View File

@ -55,6 +55,8 @@ std::set<std::string> toStringSet(const py::list list);
// Presumably converts a Python dict into a string -> int32_t map; confirm against the definition.
std::map<std::string, int32_t> toStringMap(const py::dict dict);
// Converts a Python dict into a string -> float map; keys go through toString() and values through toFloat().
std::map<std::string, float> toStringFloatMap(const py::dict dict);
// Presumably converts a Python list into a vector of strings; confirm against the definition.
std::vector<std::string> toStringVector(const py::list list);
// NOTE(review): the element type is pid_t although the name says "Int" - confirm this is intended.
std::vector<pid_t> toIntVector(const py::list input_list);

View File

@ -25,6 +25,7 @@ add_library(audio-ir-kernels OBJECT
gain_ir.cc
griffin_lim_ir.cc
highpass_biquad_ir.cc
inverse_mel_scale_ir.cc
lfilter_ir.cc
lowpass_biquad_ir.cc
magphase_ir.cc

View File

@ -0,0 +1,88 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/audio/ir/kernels/inverse_mel_scale_ir.h"
#include "minddata/dataset/audio/ir/validators.h"
#include "minddata/dataset/audio/kernels/inverse_mel_scale_op.h"
namespace mindspore {
namespace dataset {
namespace audio {
// InverseMelScale
InverseMelScaleOperation::InverseMelScaleOperation(int32_t n_stft, int32_t n_mels, int32_t sample_rate, float f_min,
                                                   float f_max, int32_t max_iter, float tolerance_loss,
                                                   float tolerance_change, const std::map<std::string, float> &sgdargs,
                                                   NormType norm, MelType mel_type)
    : n_stft_(n_stft),
      n_mels_(n_mels),
      sample_rate_(sample_rate),
      f_min_(f_min),
      f_max_(f_max),
      max_iter_(max_iter),
      tolerance_loss_(tolerance_loss),
      tolerance_change_(tolerance_change),
      sgdargs_(sgdargs),
      norm_(norm),
      mel_type_(mel_type) {
  // The optimizer settings are optional entries of sgdargs; a missing key falls back to the value below.
  constexpr float kLearningRateFallback = 0.1;
  constexpr float kMomentumFallback = 0.9;

  const auto lr_entry = sgdargs_.find("sgd_lr");
  if (lr_entry != sgdargs_.end()) {
    sgd_lr_ = lr_entry->second;
  } else {
    sgd_lr_ = kLearningRateFallback;
  }

  const auto momentum_entry = sgdargs_.find("sgd_momentum");
  if (momentum_entry != sgdargs_.end()) {
    sgd_momentum_ = momentum_entry->second;
  } else {
    sgd_momentum_ = kMomentumFallback;
  }
}
// Defaulted destructor, defined out of line.
InverseMelScaleOperation::~InverseMelScaleOperation() = default;
// Returns the IR node name stored in kInverseMelScaleOperation.
std::string InverseMelScaleOperation::Name() const { return kInverseMelScaleOperation; }
// Check the user-supplied parameters before the runtime op is built.
// Returns Status::OK() when all checks pass, otherwise the Status of the first failing check.
Status InverseMelScaleOperation::ValidateParams() {
  RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("InverseMelScale", "n_mels", n_mels_));
  RETURN_IF_NOT_OK(ValidateIntScalarNonNegative("InverseMelScale", "sample_rate", sample_rate_));
  // n_stft is used as a matrix dimension by the kernel, so zero and negative values must be rejected here.
  CHECK_FAIL_RETURN_UNEXPECTED(n_stft_ > 0,
                               "InverseMelScale: n_stft must be greater than 0, but got: " + std::to_string(n_stft_));
  CHECK_FAIL_RETURN_UNEXPECTED(n_stft_ != 1,
                               "InverseMelScale: n_stft can not be equal to 1, but got: " + std::to_string(n_stft_));
  RETURN_IF_NOT_OK(ValidateFloatScalarNonNegative("InverseMelScale", "f_max", f_max_));
  if (f_max_ == 0) {
    // f_max == 0 is the documented default and is replaced by sample_rate / 2 inside the kernel, so f_min has to
    // be compared against that effective value instead of against the literal 0.
    const float effective_f_max = static_cast<float>(sample_rate_ / 2);
    CHECK_FAIL_RETURN_UNEXPECTED(f_min_ < effective_f_max,
                                 "InverseMelScale: sample_rate / 2 must be greater than f_min when f_max is 0.");
  } else {
    CHECK_FAIL_RETURN_UNEXPECTED(f_min_ < f_max_, "InverseMelScale: f_max must be greater than f_min.");
  }
  // SGD params
  RETURN_IF_NOT_OK(ValidateFloatScalarNonNegative("InverseMelScale", "sgd_lr", sgd_lr_));
  RETURN_IF_NOT_OK(ValidateFloatScalarNonNegative("InverseMelScale", "sgd_momentum", sgd_momentum_));
  return Status::OK();
}
// Create the runtime op. The optimizer settings are passed as the already resolved sgd_lr_ / sgd_momentum_
// values, not as the raw sgdargs_ map.
std::shared_ptr<TensorOp> InverseMelScaleOperation::Build() {
  return std::make_shared<InverseMelScaleOp>(n_stft_, n_mels_, sample_rate_, f_min_, f_max_, max_iter_,
                                             tolerance_loss_, tolerance_change_, sgd_lr_, sgd_momentum_, norm_,
                                             mel_type_);
}
// Serialize the constructor arguments of this node into *out_json.
// out_json must point to a valid json object; a null pointer is reported as an error instead of being
// dereferenced. The resolved sgd_lr_ / sgd_momentum_ values are not written separately, only the sgdargs map.
Status InverseMelScaleOperation::to_json(nlohmann::json *out_json) {
  CHECK_FAIL_RETURN_UNEXPECTED(out_json != nullptr, "InverseMelScale: out_json must not be null.");
  nlohmann::json args;
  args["n_stft"] = n_stft_;
  args["n_mels"] = n_mels_;
  args["sample_rate"] = sample_rate_;
  args["f_min"] = f_min_;
  args["f_max"] = f_max_;
  args["max_iter"] = max_iter_;
  args["tolerance_loss"] = tolerance_loss_;
  args["tolerance_change"] = tolerance_change_;
  args["sgdargs"] = sgdargs_;
  args["norm"] = norm_;
  args["mel_type"] = mel_type_;
  *out_json = args;
  return Status::OK();
}
} // namespace audio
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,67 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_INVERSE_MEL_SCALE_IR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_INVERSE_MEL_SCALE_IR_H_
#include <map>
#include <memory>
#include <string>
#include "include/api/status.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/kernels/ir/tensor_operation.h"
namespace mindspore {
namespace dataset {
namespace audio {
// Name returned by InverseMelScaleOperation::Name().
constexpr char kInverseMelScaleOperation[] = "InverseMelScale";
// IR node of the InverseMelScale audio transform. It stores the user arguments, checks them in ValidateParams()
// and creates the runtime InverseMelScaleOp in Build().
class InverseMelScaleOperation : public TensorOperation {
public:
// Stores all arguments. The optimizer settings are read from sgdargs under the keys "sgd_lr" and
// "sgd_momentum"; a missing key falls back to 0.1 and 0.9 respectively.
InverseMelScaleOperation(int32_t n_stft, int32_t n_mels, int32_t sample_rate, float f_min, float f_max,
int32_t max_iter, float tolerance_loss, float tolerance_change,
const std::map<std::string, float> &sgdargs, NormType norm, MelType mel_type);
~InverseMelScaleOperation();
// Creates the runtime op from the stored arguments.
std::shared_ptr<TensorOp> Build() override;
// Checks the stored arguments and returns the Status of the first failing check.
Status ValidateParams() override;
// Returns kInverseMelScaleOperation.
std::string Name() const override;
// Writes the constructor arguments (including the raw sgdargs map) into *out_json.
Status to_json(nlohmann::json *out_json) override;
private:
// Declaration order matches the constructor's initializer list; keep both in sync.
int32_t n_stft_;
int32_t n_mels_;
int32_t sample_rate_;
float f_min_;
float f_max_;
int32_t max_iter_;
float tolerance_loss_;
float tolerance_change_;
// Raw optimizer settings as given by the caller.
std::map<std::string, float> sgdargs_;
// Values resolved from sgdargs_ in the constructor body (not in the initializer list).
float sgd_lr_;
float sgd_momentum_;
NormType norm_;
MelType mel_type_;
};
} // namespace audio
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_INVERSE_MEL_SCALE_IR_H_

View File

@ -26,6 +26,7 @@ add_library(audio-kernels OBJECT
gain_op.cc
griffin_lim_op.cc
highpass_biquad_op.cc
inverse_mel_scale_op.cc
lfilter_op.cc
lowpass_biquad_op.cc
magphase_op.cc

View File

@ -2050,5 +2050,106 @@ Status GriffinLim(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor>
momentum, length, rand_init, rnd);
}
}
// Estimate a linear-frequency spectrogram from a mel spectrogram by gradient descent.
// For every leading "channel" slice of the input a <time, freq> matrix is initialised with uniform random values
// in [0, 1) and updated with SGD steps that reduce the mean squared error between (estimate * filterbank) and the
// mel input. Iteration stops after max_iter steps, or as soon as the loss drops below tolerance_loss or changes by
// less than tolerance_change. Negative entries of the estimate are clamped to 0.
// Note: the input tensor is reshaped in place to <n, n_mels, time>, but only after all shape checks have passed.
// Output shape: the input shape with the penultimate dimension replaced by freq for rank > 2, and
// <1, freq, time> for rank-2 input.
template <typename T>
Status InverseMelScaleImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_stft,
                           int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
                           float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum,
                           NormType norm, MelType mel_type, std::mt19937 rnd) {
  using MatrixT = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;

  // Validate the layout first so that a rejected input is not left reshaped and the divisions below are safe.
  TensorShape input_shape = input->shape();
  CHECK_FAIL_RETURN_UNEXPECTED(n_mels == input_shape[-1 * TWO],
                               "InverseMelScale: n_mels must be equal to the penultimate dimension of input.");
  CHECK_FAIL_RETURN_UNEXPECTED(input_shape[-1] > 0 && input_shape[-1 * TWO] > 0,
                               "InverseMelScale: the last two dimensions of input must be greater than 0.");

  // f_max == 0 means "use half of the sample rate".
  f_max = f_max == 0 ? static_cast<T>(std::floor(sample_rate / 2)) : f_max;
  // create fb mat <freq, n_mels>
  std::shared_ptr<Tensor> freq_bin_mat;
  RETURN_IF_NOT_OK(CreateFbanks(&freq_bin_mat, n_stft, f_min, f_max, n_mels, sample_rate, norm, mel_type));
  auto fb_ptr = &*freq_bin_mat->begin<float>();
  // The buffer is viewed with Eigen's default storage order as an <n_mels, n_stft> matrix.
  Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_fb(fb_ptr, n_mels, n_stft);
  // Materialise the filterbank and its transpose once in the working precision; both are loop invariant.
  const MatrixT fb = matrix_fb.template cast<T>();
  const MatrixT fb_transposed = fb.transpose();

  // pack melspec <n, n_mels, time>
  TensorShape input_reshape({input->Size() / input_shape[-1] / input_shape[-2], input_shape[-2], input_shape[-1]});
  RETURN_IF_NOT_OK(input->Reshape(input_reshape));

  const int time = static_cast<int>(input_shape[-1]);
  const int freq = static_cast<int>(matrix_fb.cols());
  // Flattened result for all channels, each channel laid out as <freq, time>.
  std::vector<T> specgram;
  // engine for random matrix
  std::uniform_real_distribution<T> dist(0, 1);
  for (int channel = 0; channel < input_reshape[0]; channel++) {
    // slice by first dimension; the <n_mels, time> slice is viewed as a <time, n_mels> matrix
    auto channel_ptr = &*input->begin<T>() + static_cast<int64_t>(time) * n_mels * channel;
    Eigen::Map<MatrixT> input_channel(channel_ptr, time, n_mels);
    // init specgram at n=channel
    MatrixT mat_channel =
      Eigen::MatrixXd::Zero(time, freq).unaryExpr([&rnd, &dist](double) { return dist(rnd); });
    std::vector<T> vec_channel(mat_channel.data(), mat_channel.data() + mat_channel.size());
    std::shared_ptr<Tensor> param_channel;
    RETURN_IF_NOT_OK(Tensor::CreateFromVector(vec_channel, TensorShape({freq * time}), &param_channel));
    // sgd
    T loss = std::numeric_limits<T>::max();
    for (int epoch = 0; epoch < max_iter; epoch++) {
      // Evaluate prediction and residual into concrete matrices: the residual is needed for both the loss and the
      // gradient, and a lazy expression would recompute the matrix product each time it is used.
      const MatrixT pred = mat_channel * fb_transposed;
      const MatrixT diff = input_channel - pred;
      // cal loss with pred and gt
      T new_loss = diff.array().square().mean();
      // cal grad
      const MatrixT mat_grad = diff * fb * (-1) / time;
      std::vector<T> vec_grad(mat_grad.data(), mat_grad.data() + mat_grad.size());
      std::shared_ptr<Tensor> tensor_grad;
      RETURN_IF_NOT_OK(Tensor::CreateFromVector(vec_grad, TensorShape({mat_grad.size()}), &tensor_grad));
      std::shared_ptr<Tensor> nspec;
      RETURN_IF_NOT_OK(SGD<T>(param_channel, &nspec, tensor_grad, sgd_lr, sgd_momentum));
      T diff_loss = std::abs(loss - new_loss);
      // On convergence the step computed above is discarded and the current estimate is kept.
      if ((new_loss < tolerance_loss) || (diff_loss < tolerance_change)) {
        break;
      }
      loss = new_loss;
      auto step_ptr = &*nspec->begin<T>();
      mat_channel = Eigen::Map<MatrixT>(step_ptr, time, freq);
      // use new mat_channel to update param_channel
      RETURN_IF_NOT_OK(Tensor::CreateFromTensor(nspec, &param_channel));
    }
    // clamp and transpose: the <time, freq> matrix in Eigen's default storage order is read back linearly as
    // <freq, time>
    const MatrixT mat_res = mat_channel.cwiseMax(0);
    specgram.insert(specgram.end(), mat_res.data(), mat_res.data() + mat_res.size());
  }
  std::shared_ptr<Tensor> final_out;
  if (input_shape.Size() > TWO) {
    std::vector<int64_t> out_shape_vec = input_shape.AsVector();
    out_shape_vec[input_shape.Size() - 1] = time;
    out_shape_vec[input_shape.Size() - TWO] = freq;
    TensorShape output_shape(out_shape_vec);
    RETURN_IF_NOT_OK(Tensor::CreateFromVector(specgram, output_shape, &final_out));
  } else {
    TensorShape output_shape = TensorShape({input_reshape[0], freq, time});
    RETURN_IF_NOT_OK(Tensor::CreateFromVector(specgram, output_shape, &final_out));
  }
  *output = final_out;
  return Status::OK();
}
// Type dispatcher for InverseMelScaleImpl: float64 input is processed in double precision as it is, every other
// input type is first cast to float32 and processed in single precision.
Status InverseMelScale(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_stft,
                       int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
                       float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum, NormType norm,
                       MelType mel_type, std::mt19937 rnd) {
  if (input->type() != DataType::DE_FLOAT64) {
    std::shared_ptr<Tensor> float_input;
    RETURN_IF_NOT_OK(TypeCast(input, &float_input, DataType(DataType::DE_FLOAT32)));
    return InverseMelScaleImpl<float>(float_input, output, n_stft, n_mels, sample_rate, f_min, f_max, max_iter,
                                      tolerance_loss, tolerance_change, sgd_lr, sgd_momentum, norm, mel_type, rnd);
  }
  std::shared_ptr<Tensor> double_input = input;
  return InverseMelScaleImpl<double>(double_input, output, n_stft, n_mels, sample_rate, f_min, f_max, max_iter,
                                     tolerance_loss, tolerance_change, sgd_lr, sgd_momentum, norm, mel_type, rnd);
}
} // namespace dataset
} // namespace mindspore

View File

@ -501,6 +501,80 @@ Status Dct(std::shared_ptr<Tensor> *output, int32_t n_mfcc, int32_t n_mels, Norm
/// \return Status code.
Status ComplexNorm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power);
/// \brief Apply one stochastic gradient descent step to a flat parameter tensor.
/// \note The momentum buffer is local to this call and starts at zero, so no velocity is carried over from one
///     call to the next.
/// \param[in] input Current parameters; elements are read as type T.
/// \param[out] output Updated parameters as a new 1-D tensor with input->Size() elements.
/// \param[in] grad Gradient of the parameters; must have the same number of elements as input.
/// \param[in] lr Learning rate.
/// \param[in] momentum Momentum factor.
/// \param[in] dampening Dampening for momentum.
/// \param[in] weight_decay Weight decay.
/// \param[in] nesterov Whether enable nesterov momentum.
/// \param[in] stat When greater than 0, the momentum buffer of the first processed element is set to its gradient
///     directly and the flag is cleared for the remaining elements.
/// \return Status code.
template <typename T>
Status SGD(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const std::shared_ptr<Tensor> &grad,
           float lr, float momentum = 0.0, float dampening = 0.0, float weight_decay = 0.0, bool nesterov = false,
           float stat = 0.0) {
  // A size mismatch would otherwise stop the loop early and leave the trailing parameters zero-filled.
  CHECK_FAIL_RETURN_UNEXPECTED(input->Size() == grad->Size(),
                               "SGD: input and grad must have the same number of elements.");
  size_t elem_num = input->Size();
  std::vector<T> accum(elem_num);
  std::shared_ptr<Tensor> output_param;
  std::vector<T> out_param(elem_num);
  size_t ind = 0;
  auto itr_inp = input->begin<T>();
  auto itr_grad = grad->begin<T>();
  auto end_inp = input->end<T>();
  auto end_grad = grad->end<T>();
  while (itr_inp != end_inp && itr_grad != end_grad) {
    T grad_new = (*itr_grad);
    if (weight_decay > static_cast<float>(0.0)) {
      grad_new += (*itr_inp) * static_cast<T>(weight_decay);
    }
    if (momentum > 0) {
      if (stat > 0) {
        accum[ind] = grad_new;
        stat = 0;
      } else {
        accum[ind] = accum[ind] * momentum + (1 - static_cast<T>(dampening)) * grad_new;
      }
      if (nesterov) {
        grad_new += accum[ind] * momentum;
      } else {
        grad_new = accum[ind];
      }
    }
    out_param[ind] = (*itr_inp) - lr * grad_new;
    itr_inp++;
    itr_grad++;
    ind++;
  }
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_param, TensorShape({input->Size()}), &output_param));
  *output = output_param;
  return Status::OK();
}
/// \brief Use conversion matrix to solve normal STFT from mel frequency STFT.
/// \param input Tensor of shape <..., n_mels, time>. Inputs that are not float64 are cast to float32 first.
/// \param output Tensor of shape <..., freq, time>; for a 2-D input the result has shape <1, freq, time>.
/// \param n_stft Number of bins in STFT, the value must be greater than 0.
/// \param n_mels Number of mel filter, the value must be greater than 0 and equal to the penultimate dimension of
///     input.
/// \param sample_rate Sample rate of the signal, the value can't be zero.
/// \param f_min Minimum frequency, the value must be greater than or equal to 0.
/// \param f_max Maximum frequency. A value of 0 is replaced by floor(sample_rate / 2).
/// \param max_iter Maximum number of optimization iterations, the value must be greater than 0.
/// \param tolerance_loss Value of loss to stop optimization at, the value must be greater than or equal to 0.
/// \param tolerance_change Difference in losses to stop optimization at, the value must be greater than or equal to 0.
/// \param sgd_lr Learning rate for SGD optimizer, the value must be greater than or equal to 0.
/// \param sgd_momentum Momentum factor for SGD optimizer, the value must be greater than or equal to 0.
/// \param norm Type of norm, value should be NormType::kSlaney or NormType::kNone. If norm is NormType::kSlaney,
/// divide the triangle mel weight by the width of the mel band.
/// \param mel_type Type of mel, value should be MelType::kHtk or MelType::kSlaney.
/// \param rnd Random generator used to draw the initial estimate from a uniform distribution over [0, 1). It is
///     taken by value, so the caller's generator is not advanced.
/// \return Status code.
Status InverseMelScale(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_stft,
int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum, NormType norm,
MelType mel_type, std::mt19937 rnd);
/// \brief Decode mu-law encoded signal.
/// \param input Tensor of shape <..., time>.
/// \param output Tensor of shape <..., time>.

View File

@ -0,0 +1,59 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/audio/kernels/inverse_mel_scale_op.h"
#include "minddata/dataset/audio/kernels/audio_utils.h"
#include "minddata/dataset/kernels/data/data_utils.h"
#include "minddata/dataset/util/status.h"
namespace mindspore {
namespace dataset {
// Run the transform on one tensor: validate the arguments, the rank and the element type of the input, then hand
// over to InverseMelScale() together with the stored parameters and a copy of the op's random generator.
Status InverseMelScaleOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  // input / output pointer checks
  IO_CHECK(input, output);
  // the input must have at least kDefaultAudioDim dimensions
  RETURN_IF_NOT_OK(ValidateLowRank("InverseMelScale", input, kDefaultAudioDim, "<..., freq, time>"));
  // only numeric element types are accepted
  RETURN_IF_NOT_OK(ValidateTensorNumeric("InverseMelScale", input));
  Status result = InverseMelScale(input, output, n_stft_, n_mels_, sample_rate_, f_min_, f_max_, max_iter_,
                                  tolerance_loss_, tolerance_change_, sgd_lr_, sgd_momentum_, norm_, mel_type_, rnd_);
  return result;
}
// Infer the output shape from the input shape so that it matches what Compute() produces:
//   rank > 2 : <..., n_mels, time> -> <..., n_stft, time>
//   rank == 2: <n_mels, time>      -> <1, n_stft, time>
// Inputs of lower rank are reported as invalid.
Status InverseMelScaleOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) {
  RETURN_IF_NOT_OK(TensorOp::OutputShape(inputs, outputs));
  outputs.clear();
  constexpr size_t kMinRank = 2;
  if (inputs.empty()) {
    return Status(StatusCode::kMDUnexpectedError, "InverseMelScale: invalid input shape.");
  }
  auto dims = inputs[0].AsVector();
  if (dims.size() < kMinRank) {
    return Status(StatusCode::kMDUnexpectedError, "InverseMelScale: invalid input shape.");
  }
  if (dims.size() == kMinRank) {
    // Compute() emits a leading axis of size 1 for 2-D input.
    (void)dims.insert(dims.begin(), 1);
  }
  // The mel axis is replaced by the number of STFT bins; the time axis is unchanged.
  dims[dims.size() - kMinRank] = n_stft_;
  outputs.emplace_back(TensorShape(dims));
  return Status::OK();
}
// Infer the output element type: float64 input stays float64, every other numeric input produces float32.
// Non-numeric input types are rejected.
Status InverseMelScaleOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
  RETURN_IF_NOT_OK(TensorOp::OutputType(inputs, outputs));
  RETURN_IF_NOT_OK(
    ValidateTensorType("InverseMelScale", inputs[0].IsNumeric(), "[int, float, double]", inputs[0].ToString()));
  const bool keeps_double = inputs[0] == DataType(DataType::DE_FLOAT64);
  outputs[0] = keeps_double ? DataType(DataType::DE_FLOAT64) : DataType(DataType::DE_FLOAT32);
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore

View File

@ -0,0 +1,79 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_INVERSE_MEL_SCALE_OP_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_INVERSE_MEL_SCALE_OP_H_
#include <memory>
#include <random>
#include <string>
#include <vector>
#include "include/dataset/constants.h"
#include "minddata/dataset/core/tensor.h"
#include "minddata/dataset/kernels/tensor_op.h"
#include "minddata/dataset/util/random.h"
namespace mindspore {
namespace dataset {
// Runtime op of the InverseMelScale transform. It keeps the already resolved parameters plus a random generator
// and delegates the actual computation to InverseMelScale() in Compute().
class InverseMelScaleOp : public TensorOp {
public:
// Stores all parameters and seeds the random generator with GetSeed().
InverseMelScaleOp(int32_t n_stft, int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum, NormType norm,
MelType mel_type)
: n_stft_(n_stft),
n_mels_(n_mels),
sample_rate_(sample_rate),
f_min_(f_min),
f_max_(f_max),
max_iter_(max_iter),
tolerance_loss_(tolerance_loss),
tolerance_change_(tolerance_change),
sgd_lr_(sgd_lr),
sgd_momentum_(sgd_momentum),
norm_(norm),
mel_type_(mel_type) {
rnd_.seed(GetSeed());
}
~InverseMelScaleOp() override = default;
// Validates the input tensor and runs the transform.
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
// Returns kInverseMelScaleOp.
std::string Name() const override { return kInverseMelScaleOp; }
// Shape inference for the output tensor.
Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
// Type inference for the output tensor.
Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
private:
// Declaration order matches the constructor's initializer list; keep both in sync.
int32_t n_stft_;
int32_t n_mels_;
int32_t sample_rate_;
float f_min_;
float f_max_;
int32_t max_iter_;
float tolerance_loss_;
float tolerance_change_;
float sgd_lr_;
float sgd_momentum_;
NormType norm_;
MelType mel_type_;
// Seeded once in the constructor. Compute() passes it to InverseMelScale() by value, so this member itself is
// not advanced by a Compute() call.
std::mt19937 rnd_;
};
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_INVERSE_MEL_SCALE_OP_H_

View File

@ -18,6 +18,7 @@
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_AUDIO_H_
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <utility>
@ -585,6 +586,43 @@ class MS_API HighpassBiquad final : public TensorTransform {
std::shared_ptr<Data> data_;
};
/// \brief InverseMelScale TensorTransform
/// \notes Solve for a normal STFT from a mel frequency STFT, using a conversion matrix.
class MS_API InverseMelScale final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] n_stft Number of bins in STFT, must be positive.
/// \param[in] n_mels Number of mel filter, must be positive (Default: 128).
/// \param[in] sample_rate Sample rate of the signal, the value can't be zero (Default: 16000).
/// \param[in] f_min Minimum frequency, must be non-negative (Default: 0.0).
/// \param[in] f_max Maximum frequency, must be non-negative (Default: 0.0, will be set to sample_rate / 2).
/// \param[in] max_iter Maximum number of optimization iterations, must be positive (Default: 100000).
/// \param[in] tolerance_loss Value of loss to stop optimization at, must be non-negative (Default: 1e-5).
/// \param[in] tolerance_change Difference in losses to stop optimization at, must be non-negative (Default: 1e-8).
/// \param[in] sgdargs Parameters of SGD optimizer, read from the keys "sgd_lr" and "sgd_momentum"
/// (Default: {{"sgd_lr", 0.1}, {"sgd_momentum", 0.0}}). A key that is missing from the map falls back to
/// 0.1 ("sgd_lr") or 0.9 ("sgd_momentum") in InverseMelScaleOperation.
/// NOTE(review): the default map sets sgd_momentum to 0.0 while the Python API default and the
/// missing-key fallback are both 0.9 - confirm which value is intended.
/// \param[in] norm Type of norm, value should be NormType::kSlaney or NormType::kNone. If norm is NormType::kSlaney,
/// divide the triangle mel weight by the width of the mel band (Default: NormType::kNone).
/// \param[in] mel_type Type of mel, value should be MelType::kHtk or MelType::kSlaney (Default: MelType::kHtk).
explicit InverseMelScale(int32_t n_stft, int32_t n_mels = 128, int32_t sample_rate = 16000, float f_min = 0.0,
float f_max = 0.0, int32_t max_iter = 100000, float tolerance_loss = 1e-5,
float tolerance_change = 1e-8,
const std::map<std::string, float> &sgdargs = {{"sgd_lr", 0.1}, {"sgd_momentum", 0.0}},
NormType norm = NormType::kNone, MelType mel_type = MelType::kHtk);
/// \brief Destructor.
~InverseMelScale() = default;
protected:
/// \brief Function to convert TensorTransform object into a TensorOperation object.
/// \return Shared pointer to TensorOperation object.
std::shared_ptr<TensorOperation> Parse() override;
private:
/// \brief Opaque holder of the constructor arguments; defined in the implementation file.
struct Data;
std::shared_ptr<Data> data_;
};
/// \brief Design filter. Similar to SoX implementation.
class MS_API LFilter final : public TensorTransform {
public:

View File

@ -170,6 +170,7 @@ constexpr char kFrequencyMaskingOp[] = "FrequencyMaskingOp";
constexpr char kGainOp[] = "GainOp";
constexpr char kGriffinLimOp[] = "GriffinLimOp";
constexpr char kHighpassBiquadOp[] = "HighpassBiquadOp";
// Returned by InverseMelScaleOp::Name().
constexpr char kInverseMelScaleOp[] = "InverseMelScaleOp";
constexpr char kLFilterOp[] = "LFilterOp";
constexpr char kLowpassBiquadOp[] = "LowpassBiquadOp";
constexpr char kMagphaseOp[] = "MagphaseOp";

View File

@ -29,10 +29,10 @@ from .validators import check_allpass_biquad, check_amplitude_to_db, check_band_
check_bandreject_biquad, check_bass_biquad, check_biquad, check_complex_norm, check_compute_deltas, \
check_contrast, check_db_to_amplitude, check_dc_shift, check_deemph_biquad, check_detect_pitch_frequency, \
check_dither, check_equalizer_biquad, check_fade, check_flanger, check_gain, check_griffin_lim, \
check_highpass_biquad, check_lfilter, check_lowpass_biquad, check_magphase, check_mask_along_axis, \
check_mask_along_axis_iid, check_masking, check_mel_scale, check_mu_law_coding, check_overdrive, \
check_phase_vocoder, check_phaser, check_riaa_biquad, check_sliding_window_cmn, check_spectral_centroid, \
check_spectrogram, check_time_stretch, check_treble_biquad, check_vol
check_highpass_biquad, check_inverse_mel_scale, check_lfilter, check_lowpass_biquad, check_magphase, \
check_mask_along_axis, check_mask_along_axis_iid, check_masking, check_mel_scale, check_mu_law_coding, \
check_overdrive, check_phase_vocoder, check_phaser, check_riaa_biquad, check_sliding_window_cmn, \
check_spectral_centroid, check_spectrogram, check_time_stretch, check_treble_biquad, check_vol
class AudioTensorOperation(TensorOperation):
@ -1010,6 +1010,58 @@ class HighpassBiquad(AudioTensorOperation):
return cde.HighpassBiquadOperation(self.sample_rate, self.cutoff_freq, self.Q)
class InverseMelScale(AudioTensorOperation):
    """
    Solve for a normal STFT from a mel frequency STFT, using a conversion matrix.

    Args:
        n_stft (int): Number of bins in STFT.
        n_mels (int, optional): Number of mel filterbanks (default=128).
        sample_rate (int, optional): Sample rate of audio signal (default=16000).
        f_min (float, optional): Minimum frequency (default=0.0).
        f_max (float, optional): Maximum frequency (default=None, will be set to sample_rate // 2).
        max_iter (int, optional): Maximum number of optimization iterations (default=100000).
        tolerance_loss (float, optional): Value of loss to stop optimization at (default=1e-5).
        tolerance_change (float, optional): Difference in losses to stop optimization at (default=1e-8).
        sgdargs (dict, optional): Arguments for the SGD optimizer (default=None, will be set to
            {'sgd_lr': 0.1, 'sgd_momentum': 0.9}).
        norm (NormType, optional): Normalization method, can be NormType.SLANEY or NormType.NONE
            (default=NormType.NONE).
        mel_type (MelType, optional): Mel scale to use, can be MelType.SLANEY or MelType.HTK (default=MelType.HTK).

    Examples:
        >>> import numpy as np
        >>>
        >>> waveform = np.random.randn(2, 2, 3, 2)
        >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
        >>> transforms = [audio.InverseMelScale(20, 3, 16000, 0, 8000, 10)]
        >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
    """

    @check_inverse_mel_scale
    def __init__(self, n_stft, n_mels=128, sample_rate=16000, f_min=0.0, f_max=None, max_iter=100000,
                 tolerance_loss=1e-5, tolerance_change=1e-8, sgdargs=None, norm=NormType.NONE, mel_type=MelType.HTK):
        # Resolve the two "None means default" arguments first, then store everything.
        if f_max is None:
            f_max = sample_rate // 2
        if sgdargs is None:
            sgdargs = {'sgd_lr': 0.1, 'sgd_momentum': 0.9}

        self.n_stft = n_stft
        self.n_mels = n_mels
        self.sample_rate = sample_rate
        self.f_min = f_min
        self.f_max = f_max
        self.max_iter = max_iter
        self.tolerance_loss = tolerance_loss
        self.tolerance_change = tolerance_change
        self.sgdargs = sgdargs
        self.norm = norm
        self.mel_type = mel_type

    def parse(self):
        # The enum members are translated to their C++ counterparts through the lookup tables.
        positional = (self.n_stft, self.n_mels, self.sample_rate, self.f_min, self.f_max, self.max_iter,
                      self.tolerance_loss, self.tolerance_change, self.sgdargs)
        return cde.InverseMelScaleOperation(*positional, DE_C_NORM_TYPE[self.norm], DE_C_MEL_TYPE[self.mel_type])
class LFilter(AudioTensorOperation):
"""
Design two-pole filter for audio waveform of dimension of (..., time).

View File

@ -286,6 +286,87 @@ def check_gain(method):
return new_method
def check_mel_scale_n_mels(n_mels):
    """Validate n_mels: it must be an int and pass check_pos_int32."""
    arg_name = "n_mels"
    type_check(n_mels, (int,), arg_name)
    check_pos_int32(n_mels, arg_name)
def check_mel_scale_sample_rate(sample_rate):
    """Validate sample_rate: it must be an int and pass check_pos_int32."""
    arg_name = "sample_rate"
    type_check(sample_rate, (int,), arg_name)
    check_pos_int32(sample_rate, arg_name)
def check_mel_scale_freq(f_min, f_max, sample_rate):
    """
    Validate the frequency range.

    f_min must be an int or float accepted by check_float32. When f_max is given it must be an int or float
    accepted by check_pos_float32 and be greater than f_min. When f_max is None, f_min must be less than
    sample_rate // 2.

    Raises:
        ValueError: If f_min is not below the upper frequency bound.
    """
    type_check(f_min, (int, float), "f_min")
    check_float32(f_min, "f_min")

    # Without an explicit f_max the upper bound is half of the sample rate.
    if f_max is None:
        if f_min >= sample_rate // 2:
            raise ValueError("MelScale: sample_rate // 2 should be greater than f_min when f_max is set to None.")
        return

    type_check(f_max, (int, float), "f_max")
    check_pos_float32(f_max, "f_max")
    if f_min >= f_max:
        raise ValueError("MelScale: f_max should be greater than f_min.")
def check_mel_scale_n_stft(n_stft):
    """Validate n_stft: it must be an int accepted by check_pos_int32."""
    arg_name = "n_stft"
    type_check(n_stft, (int,), arg_name)
    check_pos_int32(n_stft, arg_name)
def check_mel_scale_norm(norm):
    """Validate norm: it must be a NormType enum member."""
    accepted_types = (NormType,)
    type_check(norm, accepted_types, "norm")
def check_mel_scale_mel_type(mel_type):
    """Validate mel_type: it must be a MelType enum member."""
    accepted_types = (MelType,)
    type_check(mel_type, accepted_types, "mel_type")
def check_inverse_mel_scale(method):
    """
    Decorator that validates the arguments of InverseMelScale before calling the wrapped method.

    The shared mel-scale checks (n_mels, sample_rate, f_min/f_max, n_stft, norm, mel_type) run first, followed by
    max_iter, tolerance_loss, tolerance_change and, when provided, the sgdargs dictionary.

    Raises:
        TypeError: If an argument has the wrong type, including sgdargs not being a dict.
        ValueError: If a numeric argument is outside its accepted range.
        KeyError: If sgdargs is a dict that lacks the "sgd_lr" or "sgd_momentum" key.
    """

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [n_stft, n_mels, sample_rate, f_min, f_max, max_iter, tolerance_loss, tolerance_change, sgdargs, norm,
         mel_type], _ = parse_user_args(method, *args, **kwargs)

        # Checks shared with MelScale.
        check_mel_scale_n_mels(n_mels)
        check_mel_scale_sample_rate(sample_rate)
        check_mel_scale_freq(f_min, f_max, sample_rate)
        check_mel_scale_n_stft(n_stft)
        check_mel_scale_norm(norm)
        check_mel_scale_mel_type(mel_type)

        # Checks specific to the iterative inversion.
        type_check(max_iter, (int,), "max_iter")
        check_pos_int32(max_iter, "max_iter")
        type_check(tolerance_loss, (int, float), "tolerance_loss")
        check_pos_float32(tolerance_loss, "tolerance_loss")
        type_check(tolerance_change, (int, float), "tolerance_change")
        check_pos_float32(tolerance_change, "tolerance_change")

        if sgdargs is not None:
            # Verify the container type before indexing it with string keys, so that a non-dict argument is
            # reported by name instead of surfacing as an opaque indexing error.
            type_check(sgdargs, (dict,), "sgdargs")
            sgd_lr = sgdargs["sgd_lr"]
            sgd_momentum = sgdargs["sgd_momentum"]
            type_check(sgd_lr, (int, float), "sgd_lr")
            check_non_negative_float32(sgd_lr, "sgd_lr")
            type_check(sgd_momentum, (int, float), "sgd_momentum")
            check_non_negative_float32(sgd_momentum, "sgd_momentum")
        return method(self, *args, **kwargs)

    return new_method
def check_lfilter(method):
"""Wrapper method to check the parameters of LFilter."""
@ -519,31 +600,12 @@ def check_mel_scale(method):
@wraps(method)
def new_method(self, *args, **kwargs):
[n_mels, sample_rate, f_min, f_max, n_stft, norm, mel_type], _ = parse_user_args(method, *args, **kwargs)
type_check(n_mels, (int,), "n_mels")
check_pos_int32(n_mels, "n_mels")
type_check(sample_rate, (int,), "sample_rate")
check_pos_int32(sample_rate, "sample_rate")
type_check(f_min, (int, float), "f_min")
check_float32(f_min, "f_min")
if f_max is not None:
type_check(f_max, (int, float), "f_max")
check_pos_float32(f_max, "f_max")
if f_min >= f_max:
raise ValueError("MelScale: f_max should be greater than f_min.")
else:
if f_min >= sample_rate // 2:
raise ValueError("MelScale: sample_rate // 2 should be greater than f_min when f_max is set to None.")
type_check(n_stft, (int,), "n_stft")
check_pos_int32(n_stft, "n_stft")
type_check(norm, (NormType,), "norm")
type_check(mel_type, (MelType,), "mel_type")
check_mel_scale_n_mels(n_mels)
check_mel_scale_sample_rate(sample_rate)
check_mel_scale_freq(f_min, f_max, sample_rate)
check_mel_scale_n_stft(n_stft)
check_mel_scale_norm(norm)
check_mel_scale_mel_type(mel_type)
return method(self, *args, **kwargs)

View File

@ -936,6 +936,164 @@ TEST_F(MindDataTestPipeline, TestHighpassBiquadWrongArgs) {
EXPECT_EQ(iter02, nullptr);
}
/// Feature: InverseMelScale
/// Description: test basic usage of InverseMelScale in a pipeline with float32, float64 and int16 inputs
/// Expectation: 10 rows are produced per dataset, each with the expected shape and data type
TEST_F(MindDataTestPipeline, TestInverseMelScalePipeline) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestInverseMelScalePipeline.";

  // Case 1: float32 input of shape <4, 3, 7>, n_stft = 20, n_mels = 3.
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("waveform", mindspore::DataType::kNumberTypeFloat32, {4, 3, 7}));
  std::shared_ptr<Dataset> ds = RandomData(10, schema);
  EXPECT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(4);
  EXPECT_NE(ds, nullptr);

  auto inverse_mel_scale_op1 = audio::InverseMelScale(20, 3, 16000, 0, 8000, 10);

  ds = ds->Map({inverse_mel_scale_op1});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // The iterator is dereferenced below, so a failed creation must stop the test here.
  ASSERT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<int64_t> expected = {4, 20, 7};

  int i = 0;
  while (row.size() != 0) {
    auto col = row["waveform"];
    ASSERT_EQ(col.Shape(), expected);
    ASSERT_EQ(col.Shape().size(), 3);
    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  EXPECT_EQ(i, 10);

  iter->Stop();

  // Case 2: float64 input of shape <10, 20, 30>, n_stft = 128, n_mels = 20; the output stays float64.
  std::shared_ptr<SchemaObj> schema2 = Schema();
  ASSERT_OK(schema2->add_column("waveform", mindspore::DataType::kNumberTypeFloat64, {10, 20, 30}));
  ds = RandomData(10, schema2);
  EXPECT_NE(ds, nullptr);

  auto inverse_mel_scale_op2 = audio::InverseMelScale(128, 20, 16000, 0, 8000, 100);

  ds = ds->Map({inverse_mel_scale_op2});
  EXPECT_NE(ds, nullptr);

  iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  ASSERT_OK(iter->GetNextRow(&row));

  expected = {10, 128, 30};

  i = 0;
  while (row.size() != 0) {
    auto col = row["waveform"];
    ASSERT_EQ(col.Shape(), expected);
    ASSERT_EQ(col.Shape().size(), 3);
    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat64);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  EXPECT_EQ(i, 10);

  iter->Stop();

  // Case 3: int16 input of shape <3, 4, 5>, n_stft = 128, n_mels = 4; the output is expected as float32.
  std::shared_ptr<SchemaObj> schema3 = Schema();
  ASSERT_OK(schema3->add_column("waveform", mindspore::DataType::kNumberTypeInt16, {3, 4, 5}));
  ds = RandomData(10, schema3);
  EXPECT_NE(ds, nullptr);

  auto inverse_mel_scale_op3 = audio::InverseMelScale(128, 4, 16000, 0, 8000, 100);

  ds = ds->Map({inverse_mel_scale_op3});
  EXPECT_NE(ds, nullptr);

  iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  ASSERT_OK(iter->GetNextRow(&row));

  expected = {3, 128, 5};

  i = 0;
  while (row.size() != 0) {
    auto col = row["waveform"];
    ASSERT_EQ(col.Shape(), expected);
    ASSERT_EQ(col.Shape().size(), 3);
    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  EXPECT_EQ(i, 10);

  iter->Stop();

  // Case 4: two-dimensional int16 input of shape <4, 20>, n_stft = 20, n_mels = 4; the output is expected
  // to be three-dimensional with shape <1, 20, 20>.
  std::shared_ptr<SchemaObj> schema4 = Schema();
  ASSERT_OK(schema4->add_column("waveform", mindspore::DataType::kNumberTypeInt16, {4, 20}));
  ds = RandomData(10, schema4);
  EXPECT_NE(ds, nullptr);

  auto inverse_mel_scale_op4 = audio::InverseMelScale(20, 4, 16000, 0, 8000, 100);

  ds = ds->Map({inverse_mel_scale_op4});
  EXPECT_NE(ds, nullptr);

  iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  ASSERT_OK(iter->GetNextRow(&row));

  expected = {1, 20, 20};

  i = 0;
  while (row.size() != 0) {
    auto col = row["waveform"];
    ASSERT_EQ(col.Shape(), expected);
    ASSERT_EQ(col.Shape().size(), 3);
    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  EXPECT_EQ(i, 10);

  iter->Stop();
}
/// Feature: InverseMelScale
/// Description: test InverseMelScale with invalid arguments, each attached to the valid source dataset on its own
/// Expectation: CreateIterator returns nullptr for every invalid configuration
TEST_F(MindDataTestPipeline, TestInverseMelScaleWrongArgs) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestInverseMelScaleWrongArgs.";

  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("waveform", mindspore::DataType::kNumberTypeFloat32, {3, 4, 5}));
  std::shared_ptr<Dataset> ds = RandomData(50, schema);
  EXPECT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(4);
  EXPECT_NE(ds, nullptr);

  // Every case below maps from the valid `ds`, never from a previously rejected pipeline; otherwise an
  // earlier invalid Map node would make all later CreateIterator calls fail regardless of their own arguments.
  // Argument order: n_stft, n_mels, sample_rate, f_min, f_max, max_iter, tolerance_loss, tolerance_change.

  // f_max (-100) is not greater than f_min (-100).
  auto inverse_mel_scale_op = audio::InverseMelScale(128, 4, 1000, -100, -100);
  std::shared_ptr<Dataset> ds_invalid = ds->Map({inverse_mel_scale_op});
  EXPECT_NE(ds_invalid, nullptr);
  std::shared_ptr<Iterator> iter = ds_invalid->CreateIterator();
  EXPECT_EQ(iter, nullptr);

  // n_stft (-128) is not greater than 0.
  inverse_mel_scale_op = audio::InverseMelScale(-128, 16000, 1000, 10, 100);
  ds_invalid = ds->Map({inverse_mel_scale_op});
  EXPECT_NE(ds_invalid, nullptr);
  iter = ds_invalid->CreateIterator();
  EXPECT_EQ(iter, nullptr);

  // n_mels (-16000) is not greater than 0.
  inverse_mel_scale_op = audio::InverseMelScale(128, -16000, 1000, 10, 100);
  ds_invalid = ds->Map({inverse_mel_scale_op});
  EXPECT_NE(ds_invalid, nullptr);
  iter = ds_invalid->CreateIterator();
  EXPECT_EQ(iter, nullptr);

  // max_iter (-10) is not greater than 0.
  inverse_mel_scale_op = audio::InverseMelScale(128, 16000, 1000, 10, 100, -10);
  ds_invalid = ds->Map({inverse_mel_scale_op});
  EXPECT_NE(ds_invalid, nullptr);
  iter = ds_invalid->CreateIterator();
  EXPECT_EQ(iter, nullptr);

  // tolerance_loss (-10) is not greater than 0.
  inverse_mel_scale_op = audio::InverseMelScale(128, 16000, 1000, 10, 100, 10, -10);
  ds_invalid = ds->Map({inverse_mel_scale_op});
  EXPECT_NE(ds_invalid, nullptr);
  iter = ds_invalid->CreateIterator();
  EXPECT_EQ(iter, nullptr);

  // tolerance_change (-10) is not greater than 0.
  inverse_mel_scale_op = audio::InverseMelScale(128, 16000, 1000, 10, 100, 10, 10, -10);
  ds_invalid = ds->Map({inverse_mel_scale_op});
  EXPECT_NE(ds_invalid, nullptr);
  iter = ds_invalid->CreateIterator();
  EXPECT_EQ(iter, nullptr);
}
/// Feature: MelscaleFbanks.
/// Description: Test normal operation.
/// Expectation: As expected.

View File

@ -1000,6 +1000,29 @@ TEST_F(MindDataTestExecute, TestHighpassBiquadParamCheckSampleRate) {
ASSERT_FALSE(rc.IsOk());
}
// Feature: InverseMelScale
// Description: run InverseMelScale on a single in-memory tensor through Execute (eager mode)
// Expectation: the Execute call returns an OK status
TEST_F(MindDataTestExecute, TestInverseMelScale) {
  MS_LOG(INFO) << "Doing MindDataTestExecute-TestInverseMelScale.";

  // 24 input values, arranged below as a tensor of shape <2, 2, 3, 2>.
  std::vector<float> mel_values = {
    2.716064453125000000e-03, 6.347656250000000000e-03, 9.246826171875000000e-03, 1.089477539062500000e-02,
    1.138305664062500000e-02, 1.156616210937500000e-02, 1.394653320312500000e-02, 1.550292968750000000e-02,
    1.614379882812500000e-02, 1.840209960937500000e-02, 1.718139648437500000e-02, 1.599121093750000000e-02,
    1.647949218750000000e-02, 1.510620117187500000e-02, 1.385498046875000000e-02, 1.345825195312500000e-02,
    1.419067382812500000e-02, 1.284790039062500000e-02, 1.052856445312500000e-02, 9.368896484375000000e-03,
    1.419067382812500000e-02, 1.284790039062500000e-02, 1.052856445312500000e-02, 9.368896484375000000e-03};

  std::shared_ptr<Tensor> de_tensor;
  ASSERT_OK(Tensor::CreateFromVector(mel_values, TensorShape({2, 2, 3, 2}), &de_tensor));
  auto ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));

  // n_stft = 20, n_mels = 3, sample_rate = 16000, f_min = 0, f_max = 8000, max_iter = 10.
  std::shared_ptr<TensorTransform> inverse_mel_op = std::make_shared<audio::InverseMelScale>(20, 3, 16000, 0, 8000, 10);

  // Apply the transform; the same MSTensor is passed as both input and output.
  mindspore::dataset::Execute executor({inverse_mel_op});
  Status rc = executor(ms_tensor, &ms_tensor);
  EXPECT_TRUE(rc.IsOk());
}
TEST_F(MindDataTestExecute, TestMuLawDecodingEager) {
MS_LOG(INFO) << "Doing MindDataTestExecute-TestMuLawDecodingEager.";
// testing

View File

@ -0,0 +1,155 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing InverseMelScale op in DE
"""
import numpy as np
import pytest
import mindspore.dataset as ds
import mindspore.dataset.audio.transforms as c_audio
from mindspore import log as logger
from mindspore.dataset.audio.utils import MelType, NormType
DATA_DIR = "../data/dataset/audiorecord/"
def get_ratio(mat):
    """Return the sum of all elements of mat divided by its number of elements."""
    total = mat.sum()
    count = mat.size
    return total / count
def test_inverse_mel_scale_pipeline():
    """
    Feature: InverseMelScale
    Description: test InverseMelScale cpp op in pipeline with three parameter sets
    Expectation: more than 1% of the output elements are within 10% relative error of the benchmark
    """
    column = "multi_dimensional_data"
    # Each entry: (input file, benchmark output file, keyword arguments for InverseMelScale).
    cases = [
        ("inverse_mel_scale_8x40.npy", "inverse_mel_scale_20x40_out.npy",
         {"n_stft": 20, "n_mels": 8, "sample_rate": 8000, "sgdargs": {'sgd_lr': 0.05, 'sgd_momentum': 0.9}}),
        ("inverse_mel_scale_4x80.npy", "inverse_mel_scale_40x80_out.npy",
         {"n_stft": 40, "n_mels": 4, "sgdargs": {'sgd_lr': 0.01, 'sgd_momentum': 0.9}}),
        ("inverse_mel_scale_4x160.npy", "inverse_mel_scale_40x160_out.npy",
         {"n_stft": 40, "n_mels": 4, "f_min": 10, "sgdargs": {'sgd_lr': 0.1, 'sgd_momentum': 0.8}}),
    ]

    for in_file, out_file, op_kwargs in cases:
        # A leading axis is added so that each file becomes a single dataset row.
        in_data = np.load(DATA_DIR + in_file)[np.newaxis, :]
        out_expect = np.load(DATA_DIR + out_file)[np.newaxis, :]
        dataset = ds.NumpySlicesDataset(in_data, column_names=[column], shuffle=False)
        transforms = [c_audio.InverseMelScale(**op_kwargs)]
        dataset = dataset.map(operations=transforms, input_columns=[column])
        for item in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            out_data = item[column]
            # The tiny epsilon keeps the division defined where the benchmark is exactly zero.
            epsilon = 1e-60
            relative_diff = np.abs((out_data - out_expect) / (out_expect + epsilon))
            assert get_ratio(relative_diff < 1e-1) > 1e-2
def test_inverse_mel_scale_pipeline_invalid_param():
    """
    Feature: InverseMelScale
    Description: test InverseMelScale with invalid input parameters
    Expectation: throw ValueError or TypeError with the expected message
    """
    logger.info("test InverseMelScale op with default values")
    column = "multi_dimensional_data"
    in_data = np.load(DATA_DIR + "inverse_mel_scale_32x81.npy")[np.newaxis, :]
    data1 = ds.GeneratorDataset(in_data, column_names=[column])

    # Each entry: (expected exception, message pattern, iterate after map, InverseMelScale keyword arguments).
    cases = [
        # f_min and f_max
        (ValueError, "MelScale: f_max should be greater than f_min.", True,
         {"n_mels": 20, "n_stft": 128, "sample_rate": 16200, "f_min": 1000, "f_max": 1000}),
        # n_mel
        (ValueError, r"Input n_mels is not within the required interval of \[1, 2147483647\].", False,
         {"n_mels": -1, "n_stft": 2000, "sample_rate": 16200, "f_min": 10, "f_max": 1000}),
        # sample_rate
        (ValueError, r"Input sample_rate is not within the required interval of \[1, 2147483647\].", False,
         {"n_mels": 128, "n_stft": 2000, "sample_rate": 0, "f_min": 10, "f_max": 1000}),
        # f_max
        (ValueError, r"Input f_max is not within the required interval of \(0, 16777216\].", False,
         {"n_mels": 128, "n_stft": 2000, "sample_rate": 16200, "f_min": 10, "f_max": -10}),
        # norm
        (TypeError, r"Argument norm with value slaney is not of type \[<enum 'NormType'>\], " +
         "but got <class 'str'>.", False,
         {"n_mels": 128, "n_stft": 2000, "sample_rate": 16200, "f_min": 10, "f_max": 1000,
          "norm": "slaney", "mel_type": MelType.SLANEY}),
        # mel_type
        (TypeError, r"Argument mel_type with value SLANEY is not of type \[<enum 'MelType'>\], " +
         "but got <class 'str'>.", False,
         {"n_mels": 128, "n_stft": 2000, "sample_rate": 16200, "f_min": 10, "f_max": 1000,
          "norm": NormType.NONE, "mel_type": "SLANEY"}),
        # max_iter
        (ValueError, r"Input max_iter is not within the required interval of \[1, 2147483647\].", False,
         {"n_mels": 128, "n_stft": 2000, "sample_rate": 16200, "f_min": 10, "f_max": 1000,
          "norm": NormType.NONE, "mel_type": MelType.SLANEY, "max_iter": -10}),
        # tolerance_loss
        (ValueError, r"Input tolerance_loss is not within the required interval of \(0, 16777216\].", False,
         {"n_mels": 128, "n_stft": 2000, "sample_rate": 16200, "f_min": 10, "f_max": 1000,
          "norm": NormType.NONE, "mel_type": MelType.SLANEY, "tolerance_loss": -10}),
        # tolerance_change
        (ValueError, r"Input tolerance_change is not within the required interval of \(0, 16777216\].", False,
         {"n_mels": 128, "n_stft": 2000, "sample_rate": 16200, "f_min": 10, "f_max": 1000,
          "norm": NormType.NONE, "mel_type": MelType.SLANEY, "tolerance_change": -10}),
    ]

    for error_type, pattern, iterate, op_kwargs in cases:
        with pytest.raises(error_type, match=pattern):
            # The error is expected while constructing the op; the statements after it only run if it is not raised.
            transforms = [c_audio.InverseMelScale(**op_kwargs)]
            data1 = data1.map(operations=transforms, input_columns=[column])
            if iterate:
                for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
                    _ = item[column]
def test_inverse_mel_scale_eager():
    """
    Feature: InverseMelScale
    Description: test InverseMelScale cpp op with eager mode
    Expectation: more than 1% of the output elements are within 10% relative error of the benchmark,
        and more than 0.1% are within 0.1% relative error
    """
    mel_spec = np.load(DATA_DIR + 'inverse_mel_scale_32x81.npy')
    inverse_op = c_audio.InverseMelScale(n_stft=80, n_mels=32)
    out_ms = inverse_op(mel_spec)
    out_expect = np.load(DATA_DIR + 'inverse_mel_scale_80x81_out.npy')

    # The tiny epsilon keeps the division defined where the benchmark is exactly zero.
    epsilon = 1e-60
    relative_diff = np.abs((out_ms - out_expect) / (out_expect + epsilon))

    # Loose check first, then the tighter one.
    for threshold, min_ratio in ((1e-1, 1e-2), (1e-3, 1e-3)):
        assert get_ratio(relative_diff < threshold) > min_ratio
if __name__ == "__main__":
    # Run the test functions in definition order when the file is executed directly.
    for test_case in (test_inverse_mel_scale_pipeline,
                      test_inverse_mel_scale_pipeline_invalid_param,
                      test_inverse_mel_scale_eager):
        test_case()