forked from mindspore-Ecosystem/mindspore
!17631 [assistant][FrequencyMasking]
Merge pull request !17631 from QingfengLi/FreqMask
This commit is contained in:
commit
d1112f8e9c
|
@ -23,6 +23,7 @@
|
|||
#include "minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/bass_biquad_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/time_masking_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/time_stretch_ir.h"
|
||||
|
||||
|
@ -135,6 +136,27 @@ std::shared_ptr<TensorOperation> BassBiquad::Parse() {
|
|||
return std::make_shared<BassBiquadOperation>(data_->sample_rate_, data_->gain_, data_->central_freq_, data_->Q_);
|
||||
}
|
||||
|
||||
// FrequencyMasking Transform Operation.
|
||||
/// \brief Parameters captured by the FrequencyMasking constructor and later forwarded by Parse().
struct FrequencyMasking::Data {
  /// \brief Store the masking parameters.
  /// \param[in] iid_masks Whether to apply different masks to each example.
  /// \param[in] frequency_mask_param Mask length parameter along the frequency axis.
  /// \param[in] mask_start Starting index of the mask.
  /// \param[in] mask_value Value written into the masked region.
  Data(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value)
      // Initializers are listed in member declaration order, which is the order C++ actually runs them in
      // (avoids -Wreorder warnings).
      : frequency_mask_param_(frequency_mask_param),
        mask_start_(mask_start),
        iid_masks_(iid_masks),
        mask_value_(mask_value) {}
  int32_t frequency_mask_param_;
  int32_t mask_start_;
  bool iid_masks_;
  double mask_value_;
};
|
||||
|
||||
FrequencyMasking::FrequencyMasking(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value)
|
||||
: data_(std::make_shared<Data>(iid_masks, frequency_mask_param, mask_start, mask_value)) {}
|
||||
|
||||
std::shared_ptr<TensorOperation> FrequencyMasking::Parse() {
|
||||
return std::make_shared<FrequencyMaskingOperation>(data_->iid_masks_, data_->frequency_mask_param_,
|
||||
data_->mask_start_, data_->mask_value_);
|
||||
}
|
||||
|
||||
// TimeMasking Transform Operation.
|
||||
struct TimeMasking::Data {
|
||||
Data(bool iid_masks, int64_t time_mask_param, int64_t mask_start, double mask_value)
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include "minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/bass_biquad_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/time_masking_ir.h"
|
||||
#include "minddata/dataset/audio/ir/kernels/time_stretch_ir.h"
|
||||
#include "minddata/dataset/include/dataset/transforms.h"
|
||||
|
@ -115,6 +116,19 @@ PYBIND_REGISTER(
|
|||
}));
|
||||
}));
|
||||
|
||||
// Python binding for audio::FrequencyMaskingOperation: the factory validates the parameters and
// raises (via THROW_IF_ERROR) when ValidateParams() reports an error.
PYBIND_REGISTER(
  FrequencyMaskingOperation, 1, ([](const py::module *m) {
    using FreqMaskOperation = audio::FrequencyMaskingOperation;
    (void)py::class_<FreqMaskOperation, TensorOperation, std::shared_ptr<FreqMaskOperation>>(
      *m, "FrequencyMaskingOperation")
      .def(py::init([](bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value) {
        auto operation = std::make_shared<FreqMaskOperation>(iid_masks, frequency_mask_param, mask_start, mask_value);
        THROW_IF_ERROR(operation->ValidateParams());
        return operation;
      }));
  }));
|
||||
|
||||
PYBIND_REGISTER(
|
||||
TimeMaskingOperation, 1, ([](const py::module *m) {
|
||||
(void)py::class_<audio::TimeMaskingOperation, TensorOperation, std::shared_ptr<audio::TimeMaskingOperation>>(
|
||||
|
|
|
@ -9,6 +9,7 @@ add_library(audio-ir-kernels OBJECT
|
|||
bandpass_biquad_ir.cc
|
||||
bandreject_biquad_ir.cc
|
||||
bass_biquad_ir.cc
|
||||
frequency_masking_ir.cc
|
||||
time_masking_ir.cc
|
||||
time_stretch_ir.cc
|
||||
)
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h"
|
||||
#include "minddata/dataset/audio/kernels/frequency_masking_op.h"
|
||||
#include "minddata/dataset/audio/ir/validators.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
namespace audio {
|
||||
|
||||
/// \brief Construct the IR node from the user-facing masking parameters.
/// \param[in] iid_masks Whether to apply different masks to each example.
/// \param[in] frequency_mask_param Mask length parameter along the frequency axis.
/// \param[in] mask_start Starting index of the mask.
/// \param[in] mask_value Value written into the masked region.
FrequencyMaskingOperation::FrequencyMaskingOperation(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start,
                                                     double mask_value)
    // Listed in the order the members are declared in frequency_masking_ir.h, which is the order they are
    // actually initialized in (avoids -Wreorder warnings).
    : frequency_mask_param_(frequency_mask_param),
      mask_start_(mask_start),
      iid_masks_(iid_masks),
      mask_value_(mask_value) {}
|
||||
|
||||
// Out-of-line defaulted destructor; the declaration lives in frequency_masking_ir.h.
FrequencyMaskingOperation::~FrequencyMaskingOperation() = default;
|
||||
|
||||
/// \brief Check that the integer parameters are non-negative.
/// \return Status::OK() when both frequency_mask_param and mask_start pass CheckIntScalarNonNegative,
///     otherwise the first failing status.
Status FrequencyMaskingOperation::ValidateParams() {
  constexpr char kOpLabel[] = "FrequencyMasking";
  RETURN_IF_NOT_OK(CheckIntScalarNonNegative(kOpLabel, "frequency_mask_param", frequency_mask_param_));
  RETURN_IF_NOT_OK(CheckIntScalarNonNegative(kOpLabel, "mask_start", mask_start_));
  return Status::OK();
}
|
||||
|
||||
std::shared_ptr<TensorOp> FrequencyMaskingOperation::Build() {
|
||||
std::shared_ptr<FrequencyMaskingOp> tensor_op =
|
||||
std::make_shared<FrequencyMaskingOp>(iid_masks_, frequency_mask_param_, mask_start_, mask_value_);
|
||||
return tensor_op;
|
||||
}
|
||||
|
||||
// Returns the kFrequencyMaskingOperation constant as the name of this operation.
std::string FrequencyMaskingOperation::Name() const { return kFrequencyMaskingOperation; }
|
||||
|
||||
/// \brief Serialize the operation's parameters to JSON.
/// \param[out] out_json Receives an object with the keys "frequency_mask_param", "mask_start",
///     "iid_masks" and "mask_value".
/// \return Status::OK().
Status FrequencyMaskingOperation::to_json(nlohmann::json *out_json) {
  nlohmann::json params;
  params["iid_masks"] = iid_masks_;
  params["mask_value"] = mask_value_;
  params["frequency_mask_param"] = frequency_mask_param_;
  params["mask_start"] = mask_start_;
  *out_json = params;
  return Status::OK();
}
|
||||
} // namespace audio
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,56 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_FREQUENCY_MASKING_IR_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_FREQUENCY_MASKING_IR_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "include/api/status.h"
|
||||
#include "minddata/dataset/kernels/ir/tensor_operation.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
namespace audio {
|
||||
|
||||
constexpr char kFrequencyMaskingOperation[] = "FrequencyMasking";
|
||||
|
||||
class FrequencyMaskingOperation : public TensorOperation {
|
||||
public:
|
||||
FrequencyMaskingOperation(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value);
|
||||
|
||||
~FrequencyMaskingOperation();
|
||||
|
||||
std::shared_ptr<TensorOp> Build() override;
|
||||
|
||||
Status ValidateParams() override;
|
||||
|
||||
std::string Name() const override;
|
||||
|
||||
Status to_json(nlohmann::json *out_json) override;
|
||||
|
||||
private:
|
||||
int32_t frequency_mask_param_;
|
||||
int32_t mask_start_;
|
||||
bool iid_masks_;
|
||||
double mask_value_;
|
||||
}; // class FrequencyMaskingOperation
|
||||
|
||||
} // namespace audio
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_FREQUENCY_MASKING_IR_H_
|
|
@ -10,6 +10,7 @@ add_library(audio-kernels OBJECT
|
|||
bandpass_biquad_op.cc
|
||||
bandreject_biquad_op.cc
|
||||
bass_biquad_op.cc
|
||||
frequency_masking_op.cc
|
||||
time_masking_op.cc
|
||||
time_stretch_op.cc
|
||||
)
|
||||
|
|
|
@ -399,8 +399,7 @@ Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr
|
|||
Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int64_t mask_width,
|
||||
int64_t mask_start, double mask_value, int axis) {
|
||||
if (axis != 2 && axis != 1) {
|
||||
RETURN_STATUS_UNEXPECTED(
|
||||
"MaskAlongAxis: only support Time and Frequency masking, the axis should be equal to 1 or 2.");
|
||||
RETURN_STATUS_UNEXPECTED("MaskAlongAxis: only support Time and Frequency masking, axis should be 1 or 2.");
|
||||
}
|
||||
TensorShape input_shape = input->shape();
|
||||
// squeeze input
|
||||
|
@ -409,9 +408,9 @@ Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tenso
|
|||
|
||||
int check_dim_ind = (axis == 1) ? -2 : -1;
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(0 <= mask_start && mask_start <= input_shape[check_dim_ind],
|
||||
"MaskAlongAxis: mask_start should be smaller than the length of chosen dim.");
|
||||
"MaskAlongAxis: mask_start should be less than the length of chosen dimension.");
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(mask_start + mask_width <= input_shape[check_dim_ind],
|
||||
"MaskAlongAxis: mask_width with mask_start is out of bounds.");
|
||||
"MaskAlongAxis: the sum of mask_start and mask_width is out of bounds.");
|
||||
|
||||
int64_t cell_size = input->type().SizeInBytes();
|
||||
|
||||
|
|
|
@ -208,7 +208,6 @@ Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr
|
|||
/// \return Status code
|
||||
Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int64_t mask_width,
|
||||
int64_t mask_start, double mask_value, int axis);
|
||||
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minddata/dataset/audio/kernels/frequency_masking_op.h"
|
||||
|
||||
#include "minddata/dataset/audio/kernels/audio_utils.h"
|
||||
#include "minddata/dataset/kernels/data/data_utils.h"
|
||||
#include "minddata/dataset/util/random.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
// constructor
|
||||
/// \brief Construct the kernel and seed its random engine from GetSeed().
/// \param[in] iid_masks Whether to apply different masks to each example/channel.
/// \param[in] frequency_mask_param Mask length parameter along the frequency axis.
/// \param[in] mask_start Starting index of the mask, used when iid_masks is false.
/// \param[in] mask_value Value written into the masked region.
FrequencyMaskingOp::FrequencyMaskingOp(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start,
                                       double mask_value)
    // Listed in the order the members are declared in frequency_masking_op.h, which is the order they are
    // actually initialized in (avoids -Wreorder warnings).
    : iid_masks_(iid_masks),
      frequency_mask_param_(frequency_mask_param),
      mask_start_(mask_start),
      mask_value_(mask_value) {
  rnd_.seed(GetSeed());
}
|
||||
|
||||
// main function
|
||||
/// \brief Mask the input along its frequency axis (axis 1 of the <..., freq, time> layout).
/// \param[in] input Tensor of rank >= 2; string tensors are rejected. Anything that is not float64 is cast
///     to float32 before masking.
/// \param[out] output Masked tensor.
/// \return Status of the masking helper, or an error status if a precondition check fails.
Status FrequencyMaskingOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  IO_CHECK(input, output);
  // input <..., freq, time>
  CHECK_FAIL_RETURN_UNEXPECTED(input->Rank() >= 2,
                               "FrequencyMasking: input tensor is not in shape of <..., freq, time>.");
  TensorShape input_shape = input->shape();
  CHECK_FAIL_RETURN_UNEXPECTED(
    input_shape[-2] >= frequency_mask_param_,
    "FrequencyMasking: frequency_mask_param should be less than the length of frequency dimension.");

  CHECK_FAIL_RETURN_UNEXPECTED(input->type() != DataType::DE_STRING,
                               "FrequencyMasking: input tensor type should be float, but got string.");

  // float64 input is processed as is; every other type goes through a float32 copy.
  const bool needs_float32_cast = input->type() != DataType::DE_FLOAT64;
  std::shared_ptr<Tensor> working_tensor;
  if (needs_float32_cast) {
    RETURN_IF_NOT_OK(TypeCast(input, &working_tensor, DataType(DataType::DE_FLOAT32)));
  } else {
    working_tensor = input;
  }

  // The fill value is always handed over as a double; on the float32 path it is first rounded to float
  // precision.
  const double fill_value =
    needs_float32_cast ? static_cast<double>(static_cast<float>(mask_value_)) : mask_value_;

  // iid_masks - whether to apply different masks to each example/channel.
  if (iid_masks_) {
    return RandomMaskAlongAxis(working_tensor, output, frequency_mask_param_, fill_value, 1, rnd_);
  }
  return MaskAlongAxis(working_tensor, output, frequency_mask_param_, mask_start_, fill_value, 1);
}
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_FREQUENCY_MASKING_OP_H_
|
||||
#define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_FREQUENCY_MASKING_OP_H_
|
||||
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "minddata/dataset/core/tensor.h"
|
||||
#include "minddata/dataset/kernels/tensor_op.h"
|
||||
#include "minddata/dataset/util/status.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace dataset {
|
||||
|
||||
class FrequencyMaskingOp : public TensorOp {
|
||||
public:
|
||||
explicit FrequencyMaskingOp(bool iid_masks = false, int32_t frequency_mask_param = 0, int32_t mask_start = 0,
|
||||
double mask_value_ = 0.0);
|
||||
|
||||
~FrequencyMaskingOp() override = default;
|
||||
|
||||
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
|
||||
|
||||
std::string Name() const override { return kFrequencyMaskingOp; }
|
||||
|
||||
private:
|
||||
bool iid_masks_;
|
||||
int32_t frequency_mask_param_;
|
||||
int32_t mask_start_;
|
||||
double mask_value_;
|
||||
std::mt19937 rnd_;
|
||||
};
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_FREQUENCY_MASKING_OP_H_
|
|
@ -36,7 +36,7 @@ Status TimeMaskingOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_
|
|||
CHECK_FAIL_RETURN_UNEXPECTED(input->Rank() >= 2, "TimeMasking: input dimension must be greater than 2.");
|
||||
TensorShape input_shape = input->shape();
|
||||
CHECK_FAIL_RETURN_UNEXPECTED(input_shape[-1] >= time_mask_param_,
|
||||
"TimeMasking: input time_mask_param should be smaller than the length of time dim.");
|
||||
"TimeMasking: time_mask_param should be less than the length of time dimension.");
|
||||
|
||||
std::shared_ptr<Tensor> input_tensor;
|
||||
// typecast
|
||||
|
|
|
@ -187,19 +187,22 @@ class BassBiquad final : public TensorTransform {
|
|||
std::shared_ptr<Data> data_;
|
||||
};
|
||||
|
||||
/// \brief TimeStretch TensorTransform
|
||||
/// \notes Stretch STFT in time at a given rate, without changing the pitch.
|
||||
class TimeStretch final : public TensorTransform {
|
||||
/// \brief FrequencyMasking TensorTransform.
|
||||
/// \notes Apply masking to a spectrogram in the frequency domain.
|
||||
class FrequencyMasking final : public TensorTransform {
|
||||
public:
|
||||
/// \brief Constructor.
|
||||
/// \param[in] hop_length Length of hop between STFT windows. Default: None.
|
||||
/// \param[in] n_freq Number of filter banks form STFT. Default: 201.
|
||||
/// \param[in] fixed_rate Rate to speed up or slow down the input in time. Default: None.
|
||||
explicit TimeStretch(float hop_length = std::numeric_limits<float>::quiet_NaN(), int n_freq = 201,
|
||||
float fixed_rate = std::numeric_limits<float>::quiet_NaN());
|
||||
/// \param[in] iid_masks Whether to apply different masks to each example.
|
||||
/// \param[in] frequency_mask_param Maximum possible length of the mask.
|
||||
/// Indices uniformly sampled from [0, frequency_mask_param].
|
||||
/// Mask width when iid_masks=false.
|
||||
/// \param[in] mask_start Mask start when iid_masks=false.
|
||||
/// \param[in] mask_value Mask value.
|
||||
explicit FrequencyMasking(bool iid_masks = false, int32_t frequency_mask_param = 0, int32_t mask_start = 0,
|
||||
double mask_value = 0.0);
|
||||
|
||||
/// \brief Destructor.
|
||||
~TimeStretch() = default;
|
||||
~FrequencyMasking() = default;
|
||||
|
||||
protected:
|
||||
/// \brief Function to convert TensorTransform object into a TensorOperation object.
|
||||
|
@ -237,6 +240,30 @@ class TimeMasking final : public TensorTransform {
|
|||
struct Data;
|
||||
std::shared_ptr<Data> data_;
|
||||
};
|
||||
|
||||
/// \brief TimeStretch TensorTransform
|
||||
/// \notes Stretch STFT in time at a given rate, without changing the pitch.
|
||||
class TimeStretch final : public TensorTransform {
|
||||
public:
|
||||
/// \brief Constructor.
|
||||
/// \param[in] hop_length Length of hop between STFT windows. Default: None.
|
||||
/// \param[in] n_freq Number of filter banks form STFT. Default: 201.
|
||||
/// \param[in] fixed_rate Rate to speed up or slow down the input in time. Default: None.
|
||||
explicit TimeStretch(float hop_length = std::numeric_limits<float>::quiet_NaN(), int n_freq = 201,
|
||||
float fixed_rate = std::numeric_limits<float>::quiet_NaN());
|
||||
|
||||
/// \brief Destructor.
|
||||
~TimeStretch() = default;
|
||||
|
||||
protected:
|
||||
/// \brief Function to convert TensorTransform object into a TensorOperation object.
|
||||
/// \return Shared pointer to TensorOperation object.
|
||||
std::shared_ptr<TensorOperation> Parse() override;
|
||||
|
||||
private:
|
||||
struct Data;
|
||||
std::shared_ptr<Data> data_;
|
||||
};
|
||||
} // namespace audio
|
||||
} // namespace dataset
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -145,6 +145,7 @@ constexpr char kBandBiquadOp[] = "BandBiquadOp";
|
|||
constexpr char kBandpassBiquadOp[] = "BandpassBiquadOp";
|
||||
constexpr char kBandrejectBiquadOp[] = "BandrejectBiquadOp";
|
||||
constexpr char kBassBiquadOp[] = "BassBiquadOp";
|
||||
constexpr char kFrequencyMaskingOp[] = "FrequencyMaskingOp";
|
||||
constexpr char kTimeMaskingOp[] = "TimeMaskingOp";
|
||||
constexpr char kTimeStretchOp[] = "TimeStretchOp";
|
||||
|
||||
|
|
|
@ -251,37 +251,37 @@ class BassBiquad(AudioTensorOperation):
|
|||
return cde.BassBiquadOperation(self.sample_rate, self.gain, self.central_freq, self.Q)
|
||||
|
||||
|
||||
class TimeStretch(AudioTensorOperation):
|
||||
class FrequencyMasking(AudioTensorOperation):
|
||||
"""
|
||||
Stretch STFT in time at a given rate, without changing the pitch.
|
||||
Apply masking to a spectrogram in the frequency domain.
|
||||
|
||||
Args:
|
||||
hop_length (int, optional): Length of hop between STFT windows (default=None).
|
||||
n_freq (int, optional): Number of filter banks form STFT (default=201).
|
||||
fixed_rate (float, optional): Rate to speed up or slow down the input in time (default=None).
|
||||
iid_masks (bool, optional): Whether to apply different masks to each example (default=False).
|
||||
frequency_mask_param (int): Maximum possible length of the mask (default=0).
|
||||
Indices uniformly sampled from [0, frequency_mask_param].
|
||||
mask_start (int): Mask start when iid_masks=False (default=0).
|
||||
mask_value (double): Mask value (default=0.0).
|
||||
|
||||
Examples:
|
||||
>>> freq = 44100
|
||||
>>> num_frame = 30
|
||||
>>> def gen():
|
||||
... np.random.seed(0)
|
||||
... data = np.random.random([freq, num_frame])
|
||||
... yield (np.array(data, dtype=np.float32), )
|
||||
>>> data1 = ds.GeneratorDataset(source=gen, column_names=["multi_dimensional_data"])
|
||||
>>> transforms = [py_audio.TimeStretch()]
|
||||
>>> data1 = data1.map(operations=transforms, input_columns=["multi_dimensional_data"])
|
||||
... random.seed(0)
|
||||
... data = numpy.random.random([1, 3, 2])
|
||||
... yield (numpy.array(data, dtype=numpy.float32),)
|
||||
>>> dataset = ds.GeneratorDataset(source=gen,
|
||||
... column_names=["multi_dim_data"])
|
||||
>>> dataset = dataset.map(operations=FrequencyMasking(frequency_mask_param=1),
|
||||
... input_columns=["multi_dim_data"])
|
||||
"""
|
||||
@check_time_stretch
|
||||
def __init__(self, hop_length=None, n_freq=201, fixed_rate=None):
|
||||
self.n_freq = n_freq
|
||||
self.fixed_rate = fixed_rate
|
||||
|
||||
n_fft = (n_freq - 1) * 2
|
||||
self.hop_length = hop_length if hop_length is not None else n_fft // 2
|
||||
self.fixed_rate = fixed_rate if fixed_rate is not None else np.nan
|
||||
@check_masking
|
||||
def __init__(self, iid_masks=False, frequency_mask_param=0, mask_start=0, mask_value=0.0):
|
||||
self.iid_masks = iid_masks
|
||||
self.frequency_mask_param = frequency_mask_param
|
||||
self.mask_start = mask_start
|
||||
self.mask_value = mask_value
|
||||
|
||||
def parse(self):
|
||||
return cde.TimeStretchOperation(self.hop_length, self.n_freq, self.fixed_rate)
|
||||
return cde.FrequencyMaskingOperation(self.iid_masks, self.frequency_mask_param, self.mask_start,
|
||||
self.mask_value)
|
||||
|
||||
|
||||
class TimeMasking(AudioTensorOperation):
|
||||
|
@ -314,3 +314,36 @@ class TimeMasking(AudioTensorOperation):
|
|||
|
||||
def parse(self):
|
||||
return cde.TimeMaskingOperation(self.iid_masks, self.time_mask_param, self.mask_start, self.mask_value)
|
||||
|
||||
|
||||
class TimeStretch(AudioTensorOperation):
    """
    Stretch STFT in time at a given rate, without changing the pitch.

    Args:
        hop_length (int, optional): Length of hop between STFT windows (default=None).
        n_freq (int, optional): Number of filter banks from STFT (default=201).
        fixed_rate (float, optional): Rate to speed up or slow down the input in time (default=None).

    Examples:
        >>> freq = 44100
        >>> num_frame = 30
        >>> def gen():
        ...     np.random.seed(0)
        ...     data = np.random.random([freq, num_frame])
        ...     yield (np.array(data, dtype=np.float32), )
        >>> data1 = ds.GeneratorDataset(source=gen, column_names=["multi_dimensional_data"])
        >>> transforms = [py_audio.TimeStretch()]
        >>> data1 = data1.map(operations=transforms, input_columns=["multi_dimensional_data"])
    """
    @check_time_stretch
    def __init__(self, hop_length=None, n_freq=201, fixed_rate=None):
        self.n_freq = n_freq

        # A missing hop_length falls back to half of n_fft, where n_fft = (n_freq - 1) * 2.
        n_fft = (n_freq - 1) * 2
        self.hop_length = hop_length if hop_length is not None else n_fft // 2
        # A missing fixed_rate is replaced by NaN before it is handed to the C++ operation.
        # (The earlier unconditional `self.fixed_rate = fixed_rate` was a dead store and has been dropped.)
        self.fixed_rate = fixed_rate if fixed_rate is not None else np.nan

    def parse(self):
        """Build the C++ TimeStretchOperation from the stored parameters."""
        return cde.TimeStretchOperation(self.hop_length, self.n_freq, self.fixed_rate)
|
||||
|
|
|
@ -167,6 +167,26 @@ def check_bass_biquad(method):
|
|||
return new_method
|
||||
|
||||
|
||||
def check_masking(method):
    """Decorator validating the arguments shared by time_masking and frequency_masking."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        # The wrapped __init__ takes (iid_masks, <mask param>, mask_start, mask_value) in this order.
        parsed_args, _ = parse_user_args(method, *args, **kwargs)
        iid_masks, mask_param, mask_start, mask_value = parsed_args

        type_check(iid_masks, (bool,), "iid_masks")
        # Both integer arguments get the same type check followed by the same range check.
        for int_arg, arg_name in ((mask_param, "mask_param"), (mask_start, "mask_start")):
            type_check(int_arg, (int,), arg_name)
            check_value(int_arg, (0, FLOAT_MAX_INTEGER), arg_name)
        type_check(mask_value, (int, float), "mask_value")
        check_value(mask_value, (0, DOUBLE_MAX_INTEGER), "mask_value")

        return method(self, *args, **kwargs)

    return new_method
|
||||
|
||||
|
||||
def check_time_stretch(method):
|
||||
"""Wrapper method to check the parameters of time_stretch."""
|
||||
@wraps(method)
|
||||
|
@ -186,22 +206,3 @@ def check_time_stretch(method):
|
|||
return method(self, *args, **kwargs)
|
||||
|
||||
return new_method
|
||||
|
||||
|
||||
def check_masking(method):
    """Decorator validating the arguments shared by time_masking and frequency_masking."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        # The wrapped __init__ takes (iid_masks, <mask param>, mask_start, mask_value) in this order.
        parsed_args, _ = parse_user_args(method, *args, **kwargs)
        iid_masks, mask_param, mask_start, mask_value = parsed_args

        type_check(iid_masks, (bool,), "iid_masks")
        # Both integer arguments get the same type check followed by the same range check.
        for int_arg, arg_name in ((mask_param, "mask_param"), (mask_start, "mask_start")):
            type_check(int_arg, (int,), arg_name)
            check_value(int_arg, (0, FLOAT_MAX_INTEGER), arg_name)
        type_check(mask_value, (int, float), "mask_value")
        check_value(mask_value, (0, DOUBLE_MAX_INTEGER), "mask_value")
        return method(self, *args, **kwargs)

    return new_method
|
||||
|
|
|
@ -19,6 +19,8 @@
|
|||
|
||||
#include "minddata/dataset/include/dataset/audio.h"
|
||||
#include "minddata/dataset/include/dataset/datasets.h"
|
||||
#include "minddata/dataset/include/dataset/execute.h"
|
||||
#include "minddata/dataset/include/dataset/transforms.h"
|
||||
|
||||
using namespace mindspore::dataset;
|
||||
using mindspore::LogStream;
|
||||
|
@ -487,3 +489,64 @@ TEST_F(MindDataTestPipeline, TestAnglePipelineError) {
|
|||
std::unordered_map<std::string, mindspore::MSTensor> row;
|
||||
EXPECT_ERROR(iter->GetNextRow(&row));
|
||||
}
|
||||
|
||||
// Runs FrequencyMasking (iid_masks=true, frequency_mask_param=6) over 50 random float32 rows of shape
// <200, 200> and checks that shape, rank, dtype and row count survive the transform.
TEST_F(MindDataTestPipeline, TestFrequencyMaskingPipeline) {
  MS_LOG(INFO) << "Doing TestFrequencyMasking Pipeline.";
  // Random input data
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {200, 200}));
  std::shared_ptr<Dataset> ds = RandomData(50, schema);
  EXPECT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(4);
  EXPECT_NE(ds, nullptr);

  auto frequencymasking = audio::FrequencyMasking(true, 6);

  ds = ds->Map({frequencymasking});
  EXPECT_NE(ds, nullptr);

  // Data masked by FrequencyMasking
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Check the iterator itself (the original re-checked `ds`); it is dereferenced right below, so a null
  // iterator must abort the test instead of crashing it.
  ASSERT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<int64_t> expected = {200, 200};

  int i = 0;
  while (row.size() != 0) {
    auto col = row["inputData"];
    ASSERT_EQ(col.Shape(), expected);
    ASSERT_EQ(col.Shape().size(), 2);
    ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }
  EXPECT_EQ(i, 50);

  iter->Stop();
}
|
||||
|
||||
// Builds a pipeline around FrequencyMasking with frequency_mask_param = -100 and expects iterator
// creation to fail.
TEST_F(MindDataTestPipeline, TestFrequencyMaskingWrongArgs) {
  MS_LOG(INFO) << "Doing TestFrequencyMasking with wrong args.";
  // Random float32 input of shape <20, 20>
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {20, 20}));
  std::shared_ptr<Dataset> ds = RandomData(50, schema);
  EXPECT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(4);
  EXPECT_NE(ds, nullptr);

  // Negative mask parameter
  audio::FrequencyMasking invalid_masking(true, -100);
  ds = ds->Map({invalid_masking});
  EXPECT_NE(ds, nullptr);

  // Expect failure
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_EQ(iter, nullptr);
}
|
||||
|
|
|
@ -197,6 +197,19 @@ TEST_F(MindDataTestExecute, TestCrop) {
|
|||
EXPECT_EQ(image.Shape()[1], 15);
|
||||
}
|
||||
|
||||
// Eager-mode smoke test: FrequencyMasking(iid_masks=true, frequency_mask_param=2) on a 6 x 2 float tensor
// must return an OK status.
TEST_F(MindDataTestExecute, TestFrequencyMasking) {
  MS_LOG(INFO) << "Doing TestFrequencyMasking.";
  const std::vector<float> values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f};
  std::shared_ptr<Tensor> de_tensor;
  ASSERT_OK(Tensor::CreateFromVector(values, TensorShape({6, 2}), &de_tensor));
  auto input_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));

  std::shared_ptr<TensorTransform> frequency_masking_op = std::make_shared<audio::FrequencyMasking>(true, 2);
  mindspore::dataset::Execute transform({frequency_masking_op});
  // The transform writes its result back into the tensor it read from.
  Status status = transform(input_tensor, &input_tensor);
  EXPECT_TRUE(status.IsOk());
}
|
||||
|
||||
TEST_F(MindDataTestExecute, TestTimeMasking) {
|
||||
MS_LOG(INFO) << "Doing TestTimeMasking.";
|
||||
std::shared_ptr<Tensor> input_tensor_;
|
||||
|
|
|
@ -0,0 +1,137 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Testing FrequencyMasking op in DE.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.audio.transforms as atf
|
||||
from mindspore import log as logger
|
||||
|
||||
|
||||
CHANNEL = 2
|
||||
FREQ = 30
|
||||
TIME = 30
|
||||
|
||||
|
||||
def gen(shape):
    """Yield one single-column row: a float32 array of `shape` drawn after seeding NumPy's RNG with 0."""
    np.random.seed(0)
    sample = np.random.random(shape).astype(np.float32)
    yield (sample,)
|
||||
|
||||
|
||||
def _count_unequal_element(data_expected, data_me, rtol, atol):
|
||||
""" Precision calculation func """
|
||||
assert data_expected.shape == data_me.shape
|
||||
total_count = len(data_expected.flatten())
|
||||
error = np.abs(data_expected - data_me)
|
||||
greater = np.greater(error, atol + np.abs(data_expected) * rtol)
|
||||
loss_count = np.count_nonzero(greater)
|
||||
assert (loss_count / total_count) < rtol, \
|
||||
"\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \
|
||||
format(data_expected[greater], data_me[greater], error[greater])
|
||||
|
||||
|
||||
def allclose_nparray(data_expected, data_me, rtol, atol, equal_nan=True):
    """
    Check two arrays for closeness, tolerating a small share of outliers.

    If data_expected contains NaN, np.allclose must hold outright. Otherwise a
    failed np.allclose falls back to _count_unequal_element, which only fails
    when too many elements are out of tolerance.

    Args:
        data_expected: Reference array.
        data_me: Array under test.
        rtol: Relative tolerance.
        atol: Absolute tolerance.
        equal_nan: Whether NaNs in matching positions compare equal. Default: True.

    Raises:
        AssertionError: If the arrays are not close enough.
    """
    expected_has_nan = np.any(np.isnan(data_expected))
    within_tolerance = np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan)
    if expected_has_nan:
        assert within_tolerance
        return
    if not within_tolerance:
        _count_unequal_element(data_expected, data_me, rtol, atol)
|
||||
|
||||
|
||||
def test_func_frequency_masking_eager_random_input():
    """
    Eager mode: FrequencyMasking on a random spectrogram must keep the input shape.
    """
    logger.info("test frequency_masking op")
    expected_shape = (CHANNEL, FREQ, TIME)
    spectrogram = next(gen(expected_shape))[0]
    # Positional arguments: iid_masks, frequency_mask_param, mask_start, mask_value.
    masking_op = atf.FrequencyMasking(False, 3, 1, 10)
    masked = masking_op(spectrogram)
    assert masked.shape == expected_shape
|
||||
|
||||
|
||||
def test_func_frequency_masking_eager_precision():
    """
    Eager mode: compare FrequencyMasking output against a known result.

    With frequency_mask_param=2, mask_start=0 and mask_value=0 the benchmark has
    the first two frequency rows of each channel zeroed and the last row unchanged.
    """
    logger.info("test frequency_masking op")
    spectrogram = np.array(
        [
            [
                [0.17274511, 0.85174704, 0.07162686, -0.45436913],
                [-1.045921, -1.8204843, 0.62333095, -0.09532598],
                [1.8175547, -0.25779432, -0.58152324, -0.00221091],
            ],
            [
                [-1.205032, 0.18922766, -0.5277673, -1.3090396],
                [1.8914849, -0.97001046, -0.23726775, 0.00525892],
                [-1.0271876, 0.33526883, 1.7413973, 0.12313101],
            ],
        ],
        dtype=np.float32,
    )
    # Build the benchmark before running the op: same data with rows 0 and 1 set to 0.
    out_benchmark = spectrogram.copy()
    out_benchmark[:, :2, :] = 0.0

    out_ms = atf.FrequencyMasking(False, 2, 0, 0)(spectrogram)
    # Argument order (op output first, benchmark second) is kept as in the original call.
    allclose_nparray(out_ms, out_benchmark, 0.0001, 0.0001)
|
||||
|
||||
|
||||
def test_func_frequency_masking_pipeline():
    """
    Pipeline mode: map FrequencyMasking over a GeneratorDataset and check the output shape.
    """
    logger.info("test frequency_masking op, pipeline")

    column = "multi_dimensional_data"
    dataset = ds.GeneratorDataset(source=gen([CHANNEL, FREQ, TIME]), column_names=[column])
    dataset = dataset.map(operations=[atf.FrequencyMasking(True, 8)], input_columns=[column])

    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert row[column].shape == (CHANNEL, FREQ, TIME)
|
||||
|
||||
|
||||
def test_frequency_masking_invalid_input():
    """
    FrequencyMasking must reject invalid arguments with the expected error type and message.

    Two groups of cases are covered: arguments rejected when the op is
    constructed, and arguments rejected only when the op is applied to data.
    """

    def test_invalid_param(test_name, iid_masks, frequency_mask_param, mask_start, error, error_msg):
        # Constructing the op is expected to raise.
        logger.info("Test FrequencyMasking with wrong params: {0}".format(test_name))
        with pytest.raises(error) as error_info:
            atf.FrequencyMasking(iid_masks, frequency_mask_param, mask_start)
        assert error_msg in str(error_info.value)

    def test_invalid_input(test_name, iid_masks, frequency_mask_param, mask_start, error, error_msg):
        # Building and applying the op to a (CHANNEL, FREQ, TIME) spectrogram is expected to raise.
        logger.info("Test FrequencyMasking with wrong params: {0}".format(test_name))
        with pytest.raises(error) as error_info:
            spectrogram = next(gen((CHANNEL, FREQ, TIME)))[0]
            _ = atf.FrequencyMasking(iid_masks, frequency_mask_param, mask_start)(spectrogram)
        assert error_msg in str(error_info.value)

    # (test_name, iid_masks, frequency_mask_param, mask_start, error, error_msg)
    param_cases = (
        ("invalid mask_start", True, 2, -10, ValueError,
         "Input mask_start is not within the required interval of [0, 16777216]."),
        ("invalid mask_param", True, -2, 10, ValueError,
         "Input mask_param is not within the required interval of [0, 16777216]."),
        ("invalid iid_masks", "True", 2, 10, TypeError,
         "Argument iid_masks with value True is not of type [<class 'bool'>], but got <class 'str'>."),
    )
    input_cases = (
        ("invalid mask_start", False, 2, 100, RuntimeError,
         "MaskAlongAxis: mask_start should be less than the length of chosen dimension."),
        ("invalid mask_width", False, 200, 2, RuntimeError,
         "FrequencyMasking: frequency_mask_param should be less than the length of frequency dimension."),
    )

    for case in param_cases:
        test_invalid_param(*case)
    for case in input_cases:
        test_invalid_input(*case)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run every test case in this module, in declaration order, when executed as a script.
    for test_case in (
            test_func_frequency_masking_eager_random_input,
            test_func_frequency_masking_eager_precision,
            test_func_frequency_masking_pipeline,
            test_frequency_masking_invalid_input,
    ):
        test_case()
|
|
@ -125,9 +125,9 @@ def test_time_masking_invalid_input():
|
|||
"Argument iid_masks with value True is not of type [<class 'bool'>], but got <class 'str'>.")
|
||||
|
||||
test_invalid_input("invalid mask_start", False, 2, 100, RuntimeError,
|
||||
"MaskAlongAxis: mask_start should be smaller than the length of chosen dim.")
|
||||
"MaskAlongAxis: mask_start should be less than the length of chosen dimension.")
|
||||
test_invalid_input("invalid mask_width", False, 200, 2, RuntimeError,
|
||||
"TimeMasking: input time_mask_param should be smaller than the length of time dim.")
|
||||
"TimeMasking: time_mask_param should be less than the length of time dimension.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
Loading…
Reference in New Issue